class QidianNovelsItem(Item):
    """Field declarations for entries selected by ``ul.all-img-list>li``.

    The ``clean_*`` coroutines below prefix the novel URL, the author home
    URL and the cover URL with ``https:``.
    """

    target_item = TextField(css_select="ul.all-img-list>li")
    novel_url = AttrField(attr="href", css_select="div.book-img-box>a")
    novel_name = TextField(css_select="div.book-mid-info>h4")
    novel_author = TextField(css_select="div.book-mid-info>p.author>a.name")
    novel_author_home_url = AttrField(
        attr="href",
        css_select="div.book-mid-info>p.author>a.name",
    )
    novel_type = TextField(
        css_select="div.book-mid-info > p.author > a:nth-child(4)",
    )
    novel_cover = AttrField(attr="src", css_select="div.book-img-box img")
    novel_abstract = TextField(css_select="div.book-mid-info p.intro")
    # Disabled field, kept for reference:
    # novel_latest_chapter = TextField(css_select='div.bookupdate a')

    async def clean_novel_url(self, novel_url):
        """Return ``novel_url`` with ``https:`` prepended."""
        return "https:" + novel_url

    async def clean_novel_author(self, novel_author):
        """Return the author; for a list, the ``.text`` of its first element."""
        if isinstance(novel_author, list):
            return novel_author[0].text
        return novel_author

    async def clean_novel_author_home_url(self, novel_author_home_url):
        """Return the author home URL with ``https:`` prepended.

        When a list is given, the stripped ``href`` of its first element is
        used as the URL.
        """
        if isinstance(novel_author_home_url, list):
            first_node = novel_author_home_url[0]
            novel_author_home_url = first_node.get("href").strip()
        return "https:" + novel_author_home_url

    async def clean_novel_cover(self, novel_cover):
        """Return ``novel_cover`` with ``https:`` prepended."""
        return "https:" + novel_cover
class FictionItem(Item):
    """Field declarations: ``target_item`` selects ``dd``; title and URL come from ``a``."""

    target_item = TextField(css_select='dd')
    title = TextField(css_select="a")
    url = AttrField(attr="href", css_select="a")

    async def clean_title(self, value):
        """Return the title unchanged (identity clean step)."""
        return value
class HackerNewsItem(Item):
    """Field declarations: ``tr.athing`` rows with the text and href of ``a.storylink``."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Return the title unchanged (identity clean step)."""
        return value
class MyItem(Item):
    """Field declarations for ``.title``, ``.star`` and all ``.tag`` matches."""

    title = TextField(css_select=".title")
    star = TextField(css_select=".star")
    # many=True is passed so that every ".tag" match is requested.
    tags = TextField(many=True, css_select=".tag")

    async def clean_star(self, value):
        """Return ``value`` converted with ``int``."""
        star_count = int(value)
        return star_count
class MyItem(Item):
    """Field declarations for ``.title``, ``.star`` and all ``.tag`` matches."""

    title = TextField(css_select='.title')
    star = TextField(css_select='.star')
    # many=True is passed so that every ".tag" match is requested.
    tags = TextField(many=True, css_select='.tag')

    async def clean_star(self, value):
        """Return ``value`` converted with ``int``."""
        star_count = int(value)
        return star_count
class MyItem(Item):
    """Field declarations: ``target_item`` selects ``.movie``; fields are ``.title`` and ``.star``."""

    target_item = TextField(css_select='.movie')
    title = TextField(css_select='.title')
    star = TextField(css_select='.star')

    async def clean_star(self, value):
        """Return ``value`` converted with ``int``."""
        star_count = int(value)
        return star_count
class MyItem(Item):
    """Field declarations: ``target_item`` selects ``.movie``; fields are ``.title`` and ``.star``."""

    target_item = TextField(css_select=".movie")
    title = TextField(css_select=".title")
    star = TextField(css_select=".star")

    async def clean_star(self, value):
        """Return ``value`` converted with ``int``."""
        star_count = int(value)
        return star_count
class JianshuItem(Item):
    """Field declarations: ``ul.list>li`` entries with the author name and link of ``a.name``."""

    target_item = TextField(css_select='ul.list>li')
    author_name = TextField(css_select='a.name')
    author_url = AttrField(css_select='a.name', attr='href')

    async def clean_author_url(self, author_url):
        """Return ``author_url`` prefixed with the jianshu.com origin."""
        absolute_url = f"https://www.jianshu.com{author_url}"
        return absolute_url
class ArticleListItem(Item):
    """Field declarations for an article list page.

    eg: http://www.ruanyifeng.com/blog/essays/
    """

    target_item = TextField(css_select="div#alpha-inner li.module-list-item")
    # Title text and link are both read from the same anchor element.
    title = TextField(css_select="li.module-list-item>a")
    href = AttrField(attr="href", css_select="li.module-list-item>a")
class HackerNewsItem(Item):
    """Field declarations: ``tr.athing`` rows with the text and href of ``a.storylink``."""

    target_item = TextField(css_select='tr.athing')
    title = TextField(css_select='a.storylink')
    url = AttrField(attr='href', css_select='a.storylink')

    async def clean_title(self, value):
        """Return the title with leading and trailing whitespace stripped."""
        stripped_title = value.strip()
        return stripped_title
class FishItem(Item):
    """Field declarations: ``div.m_search_list dl`` entries with title, date and URL."""

    target_item = TextField(css_select="div.m_search_list dl")
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Return ``value`` with every ``发布时间:`` label removed."""
        return value.replace("发布时间:", "")
class Data258WechatItem(Item):
    """Extract first-level search page info for WeChat official accounts (data258).

    Example: https://mp.data258.com/mp/search?type=category&key=老胡的储物柜&sort=
    """

    target_item = TextField(css_select='div.layui-panel')
    wechat_name = TextField(default='', css_select='h2>a')
    # NOTE(review): "wehcat" looks like a typo of "wechat"; the attribute name
    # is kept as-is because code outside this view may reference it.
    wehcat_href = AttrField(default='', attr='href', css_select='h2>a')
class MyItem(Item):
    """Field declarations for ``.movie`` entries; a falsy title raises ``IgnoreThisItem``."""

    target_item = TextField(css_select=".movie")
    title = TextField(css_select='.title')
    star = TextField(css_select='.star')

    @staticmethod
    async def clean_title(value):
        """Return ``value`` when it is truthy.

        Raises:
            IgnoreThisItem: when ``value`` is falsy (e.g. empty or ``None``).
        """
        if value:
            return value
        raise IgnoreThisItem
class FishItem(Item):
    """Field declarations: ``td [width="530"]`` entries with title, date and URL."""

    target_item = TextField(css_select='td [width="530"]')
    title = TextField(css_select="a")
    date = TextField(css_select="td")
    url = AttrField(attr="href", css_select="a")

    async def clean_date(self, value):
        """Return the last ten characters of ``value`` with ``.`` replaced by ``-``."""
        tail = value[-10:]
        return tail.replace(".", "-")
class DoubanItem(Item):
    """Field declarations: ``div.item`` entries with title, cover and an optional abstract."""

    target_item = TextField(css_select='div.item')
    title = TextField(css_select='span.title')
    cover = AttrField(attr='src', css_select='div.pic>a>img')
    abstract = TextField(default='', css_select='span.inq')

    async def clean_title(self, title):
        """Return a string title.

        A ``str`` is returned unchanged. Otherwise ``title`` is iterated and
        the stripped ``.text`` of each element (with ``\\xa0`` removed) is
        concatenated.
        """
        if not isinstance(title, str):
            return ''.join(
                node.text.strip().replace('\xa0', '') for node in title
            )
        return title
class DoubanItems(Item):
    """Field declarations: ``div.item`` entries with title, cover and abstract."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")
    cover = AttrField(attr="src", css_select="div.pic>a>img")
    abstract = TextField(css_select="span.inq")

    async def clean_title(self, title):
        """Return a string title.

        A ``str`` is returned unchanged. Otherwise ``title`` is iterated and
        the stripped ``.text`` of each element (with ``\\xa0`` removed) is
        concatenated.
        """
        if not isinstance(title, str):
            return "".join(
                node.text.strip().replace("\xa0", "") for node in title
            )
        return title
class FishItem(Item):
    """Field declarations: ``#info`` entries with title, date and URL."""

    # The selector's trailing space is part of the original string and is kept.
    target_item = TextField(css_select="#info ")
    title = TextField(css_select="h2 a")
    date = TextField(css_select="dd.search_laiyuan")
    url = AttrField(attr="href", css_select="h2 a")

    async def clean_date(self, value):
        """Return the text after the first ``(`` with trailing ``)`` removed.

        Raises:
            IndexError: when ``value`` contains no ``(``.
        """
        after_paren = value.split("(")[1]
        date = after_paren.rstrip(")")
        # NOTE(review): this looks like leftover debug output; it is kept
        # because it is an observable side effect.
        print('date =', date)
        return date
class DiseaseHomeItem(Item):
    """Field declarations for disease name, subject, and the question list with its links."""

    disease_name = TextField(css_select="h1.ti")
    disease_subject = TextField(css_select="div.table>div:nth-child(1)>span")
    # Question texts and hrefs are read from the same anchors, both with many=True.
    disease_ask_lists = TextField(
        many=True,
        css_select="div.ask_lists div.lists a",
    )
    disease_ask_link_lists = AttrField(
        many=True,
        attr="href",
        css_select="div.ask_lists div.lists a",
    )

    async def clean_disease_ask_link_lists(self, disease_ask_link_lists):
        """Return a list with ``http:`` prepended to every link."""
        return ["http:" + link for link in disease_ask_link_lists]
class RankingItem(Item):
    """Field declarations: ``.rank-list`` blocks with title, "more" link and book list."""

    target_item = TextField(css_select='.rank-list')
    ranking_title = AttrField(css_select='h3.wrap-title', attr='html')
    more = AttrField(css_select='h3>a.more', attr='href')
    book_list = TextField(css_select='div.book-list>ul>li')

    async def clean_ranking_title(self, ranking_title):
        """Return the ranking title.

        For a list, the ``.text`` of the first element is returned. Any other
        value is returned unchanged; the previous implementation fell off the
        end of the function here and returned ``None``, discarding a value
        that had already been extracted.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        return ranking_title

    async def clean_more(self, more):
        """Return ``more`` with ``http:`` prepended."""
        return "http:" + more
class DoubanItem(Item):
    """Define the target fields of the spider."""

    target_item = TextField(css_select="div.item")
    title = TextField(css_select="span.title")

    async def clean_title(self, title):
        """Return a string title.

        A ``str`` is returned unchanged. Otherwise ``title`` is iterated and
        the stripped ``.text`` of each element (with ``\\xa0`` removed) is
        concatenated.
        """
        if not isinstance(title, str):
            return "".join(
                node.text.strip().replace("\xa0", "") for node in title
            )
        return title
class PageItem(Item):
    """Field declarations: the pager (``ul.be-pager``) and its total-page text."""

    target_item = TextField(css_select='ul.be-pager')
    count = TextField(css_select='span.be-pager-total')

    async def clean_count(self, value):
        """Return the first integer found in ``value``.

        The previous pattern ``\\d+\\.?\\d*`` could capture text such as
        ``"12."`` or ``"12.5"``, which ``int()`` rejects; only the leading run
        of digits is matched now, so those inputs yield ``12``.

        Raises:
            ValueError: when ``value`` contains no digits. ``ValueError`` is a
                subclass of ``Exception``, so handlers written for the
                previous bare ``Exception`` still catch it.
        """
        first_number = re.search(r"\d+", value)
        if first_number is None:
            raise ValueError("Error:PageItem re.findall -> pageInfo.count")
        return int(first_number.group())
class RankingItem(Item):
    """Field declarations: ``.rank-list`` blocks with title, "more" link and book list."""

    target_item = TextField(css_select=".rank-list")
    ranking_title = TextField(css_select="h3.wrap-title")
    more = AttrField(attr="href", css_select="h3>a.more")
    book_list = HtmlField(many=True, css_select="div.book-list>ul>li")

    async def clean_ranking_title(self, ranking_title):
        """Return the ranking title.

        For a list, the ``.text`` of the first element is returned. Otherwise
        the value is stringified, cut at the first ``榜`` and ``榜`` is
        appended again.
        """
        if isinstance(ranking_title, list):
            return ranking_title[0].text
        # partition()[0] equals split()[0]: the text before the first "榜",
        # or the whole string when "榜" is absent.
        head = str(ranking_title).partition('榜')[0]
        return head + '榜'

    async def clean_more(self, more):
        """Return ``more`` with ``https:`` prepended."""
        return "https:" + more
class Data258WechatListItem(Item):
    """Extract historical article info for a WeChat official account (data258).

    Example: https://mp.data258.com/article/category/howie_locker
    """

    target_item = TextField(css_select='ul.jie-row>li')
    w_article_title = TextField(default='', css_select='a.jie-title')
    w_article_href = AttrField(default='', attr='href', css_select='a.jie-title')

    async def clean_w_article_title(self, value: list):
        """Return the article title: stripped ``str(value)``, or ``""`` if falsy."""
        if not value:
            return ""
        return str(value).strip()
class HackerNewsItem(Item):
    """Define the extraction rules for the target fields."""

    target_item = TextField(css_select="tr.athing")
    title = TextField(css_select="a.storylink")
    url = AttrField(attr="href", css_select="a.storylink")

    async def clean_title(self, value):
        """Clean the target data.

        :param value: raw target data
        :return: ``str(value)`` with surrounding whitespace stripped
        """
        as_text = str(value)
        return as_text.strip()
class MyItem(Item):
    """Define the target fields of the spider."""

    # The whole document is the target; the page title and the <article>
    # markup are read from it.
    target_item = HtmlField(css_select="html")
    title = TextField(css_select="head title")
    article = HtmlField(css_select="article")
class FishItem(Item):
    """Field declarations: ``div.jsearch-result-box`` entries with title, date and URL."""

    target_item = TextField(css_select="div.jsearch-result-box")
    title = TextField(css_select="div.jsearch-result-title")
    date = TextField(css_select="span.jsearch-result-date")
    url = AttrField(attr="href", css_select="div.jsearch-result-title a")

    async def clean_date(self, value):
        """Return the date with ``年``/``月`` replaced by ``-``.

        Trailing ``-``, then trailing spaces, then trailing ``日`` characters
        are stripped (in that order) before the replacement.
        """
        trimmed = value.rstrip("-").rstrip(" ").rstrip("日")
        return trimmed.replace("年", "-").replace("月", "-")

    async def clean_url(self, value):
        """Return ``value`` prefixed with the ``www.cast.org.cn`` origin."""
        return "http://www.cast.org.cn" + value
class UKSearchItem(Item):
    """Field declarations reading three attributes of ``a.pack-top_2nSnm``."""

    target_item = TextField(
        css_select="div[data-spm=PhoneSokuThreeProgram_4] > div.pack-cover_1K0Xq",
    )
    # title, url and img all come from the same anchor, via different attributes.
    title = AttrField(attr="data-trackinfo", css_select="a.pack-top_2nSnm")
    url = AttrField(attr="href", css_select="a.pack-top_2nSnm")
    img = AttrField(attr="style", css_select="a.pack-top_2nSnm")
class ArchivesItem(Item):
    """Field declarations for an archives page.

    eg: http://www.ruanyifeng.com/blog/archives.html
    """

    target_item = TextField(css_select='div#beta-inner li.module-list-item')
    href = AttrField(attr='href', css_select='li.module-list-item>a')
class QidianNovelInfoItem(Item):
    """Item subclass declaring the fields of a novel detail page."""

    novel_name = TextField(css_select='.book-info>h1>em')
    author = TextField(css_select='a.writer')
    # Use AttrField when the value to extract is an attribute.
    cover = AttrField(css_select='a#bookImg>img', attr='src')
    abstract = TextField(css_select='div.book-intro>p')
    status = TextField(css_select='p.tag>span.blue')
    novels_type = TextField(css_select='p.tag>a.red')
    latest_chapter = TextField(css_select='li.update>div.detail>p.cf>a')
    latest_chapter_time = TextField(css_select='div.detail>p.cf>em')

    async def clean_cover(self, cover):
        """Return ``cover`` with ``http:`` prepended."""
        return 'http:' + cover

    async def clean_status(self, status):
        """Return the status as one string.

        When only one target matches, the value is extracted directly;
        otherwise a list is returned and its elements are joined here.

        :param status: an already-extracted string, or an iterable of
            elements exposing ``.text``
        :return: the string unchanged, or the element texts joined by ``#``
        """
        # A plain string used to be iterated character by character and then
        # failed on ``.text``; return it as-is instead.
        if isinstance(status, str):
            return status
        return '#'.join(tag.text for tag in status)

    async def clean_novels_type(self, novels_type):
        """Return the novel types as one string.

        :param novels_type: an already-extracted string, or an iterable of
            elements exposing ``.text``
        :return: the string unchanged, or the element texts joined by ``#``
        """
        # Same guard as clean_status: do not iterate a plain string.
        if isinstance(novels_type, str):
            return novels_type
        return '#'.join(tag.text for tag in novels_type)

    async def clean_latest_chapter_time(self, latest_chapter_time):
        """Replace the relative day words with local ``YYYY-MM-DD `` dates.

        ``今天`` becomes the current local date and ``昨日`` the local date
        24 hours earlier. Each date string ends with a space.
        """
        date_format = "%Y-%m-%d "
        today = time.strftime(date_format, time.localtime())
        yesterday = time.strftime(
            date_format, time.localtime(time.time() - 24 * 60 * 60))
        return latest_chapter_time.replace('今天', today).replace(
            '昨日', yesterday)
class DiseaseItem(Item):
    """Field declarations: names and links of every ``div.keshi_list>a`` anchor."""

    disease_name = TextField(many=True, css_select="div.keshi_list>a")
    disease_url = AttrField(
        many=True,
        attr="href",
        css_select="div.keshi_list>a",
    )

    async def clean_disease_url(self, disease_url):
        """Return every link resolved against ``https://m.120ask.com``."""
        base = 'https://m.120ask.com'
        return [urljoin(base, link) for link in disease_url]