class WeiboSpider(Spider):
    name = "weibo_spider"
    hotbase_url = "https://weibo.cn/search/mblog?" \
                  "hideSearchFrame=&keyword=#" \
                  "&advancedfilter=1&sort=hot&page="
    base_url = "https://weibo.cn/search/mblog?" \
               "keyword=#" \
               "&sort=time&page="
    custom_settings = {
        # Replace the Cookie below with your own Cookie
        'CONCURRENT_REQUESTS': 16,
        'DOWNLOAD_DELAY': 3,
        'COOKIES_ENABLED': False,
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
            'Cookie': '_T_WM=f7fc1975334e2610dd77c4a949caaa2e; __guid=78840338.2690867225963806000.1561098303'
                      '394.3926; TMPTOKEN=xWotEi1ho4BsQadI1WKh50PW3wxeD0MXriFaU01sHfs7ddDfYkc6g8QC0brRQ2iI; SUB=_2A25'
                      'wCAggDeRhGeBM6lER8yzJzD2IHXVT8qhorDV6PUJbkdAKLUnFkW1NRRuUuWjxLqOlReVo19AJKlLVFf0K-Qcb; SUHB=0h1J3'
                      'h4MnoA8ye; SCF=AqNspA5hvpJAB-QOIpSEFOvS7uTz2C-xcjU2d4im-izONxHBJbLovO6aDcPk7st0qIDcNhWWOxTPgrhwE'
                      'NoLpoA.; SSOLoginState=1561098352; monitor_count=2'
        }
    }

    def __init__(self, *args, **kwargs):
        self.slog = SpiderLogUtil()
        super().__init__(*args, **kwargs)

    def close(self, reason):
        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):
        querystr = getattr(self, 'q', '中美贸易')
        self.querystr = querystr
        times = getattr(self, 't', 3)
        self.times = times
        self.slog.spider_start(self)

        # # Set the local folder for saving weibo images here
        # folderpath = "e:\weibo" + querystr
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)
        # folderpath = "e:\weibo"
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)

        self.q = []
        self.base_url = self.base_url.replace("#", querystr)
        self.hotbase_url = self.hotbase_url.replace("#", querystr)
        print("开始爬取微博,关键字为" + self.querystr + "第" + str(self.times) + "次")
        yield Request(url=self.hotbase_url + "1", callback=self.parse_tweet)
        yield Request(url=self.base_url + "1", callback=self.parse_tweet)

    # def parse_url(self, response):
    #     if response.url.endswith('page=1'):
    #         # On page 1, schedule all remaining result pages at once
    #         all_page = re.search(r' 1/(\d+)页', response.text)
    #         if all_page:
    #             all_page = all_page.group(1)
    #             all_page = int(all_page)
    #             print('获取到了页数', all_page)
    #             if all_page >= 99:
    #                 all_page = 99
    #             for page_num in range(2, 3):
    #                 page_url = response.url.replace(
    #                     'page=1', 'page={}'.format(page_num))
    #                 yield Request(url=page_url, callback=self.parse_url,
    #                               dont_filter=True, meta=response.meta)
    #     # Parse the tweets on this page
    #     tree_node = etree.HTML(response.body)
    #     tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    #     for tweet_node in tweet_nodes:
    #         tweet_repost_url = tweet_node.xpath(
    #             './/a[contains(text(),"转发[")]/@href')[0]
    #         user_tweet_id = re.search(
    #             r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
    #         weibo_url = 'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
    #                                                      user_tweet_id.group(1))
    #         yield Request(url=weibo_url, callback=self.parse_details,
    #                       dont_filter=True, meta=response.meta, args={'wait': 2})

    def parse_details(self, response):
        body = response.body
        body = body.decode("utf-8")
        # print(body)
        selector = Selector(text=body)
        headandname = selector.xpath(
            '//div[@class="face"]/a[@class="W_face_radius"]')[0]
        head = headandname.xpath("./img/@src").get()
        author_name = headandname.xpath("./@title").get()
        author_url = headandname.xpath("./@href").get()
        timeandfrom = selector.xpath('//div[@class="WB_from S_txt2"]')[0]
        posttime = timeandfrom.xpath("./a")[0].xpath("./@title").get()
        timeArray = time.strptime(posttime, "%Y-%m-%d %H:%M")
        created_at = time.strftime("%Y-%m-%d-%H-%M-%S", timeArray)
        crawl_time = str(int(time.time()))
        tool = ""
        if len(timeandfrom.xpath("./a")) > 1:
            tool = timeandfrom.xpath("./a")[1].xpath("./text()").get()
        content1 = selector.xpath('//div[@class="WB_text W_f14"]')
        content = content1.get()
        p = re.sub(r'<.*?>', '', content)
        content = re.sub(r' ', '', p).strip()
        location1 = content1.xpath('./a/i[@class="W_ficon ficon_cd_place"]')
        if location1:
            location = location1.xpath('../@title').get()
            content = content.rstrip(location)
            print(content, location)

    def parse_information(self, response):
        """Crawl a user's profile information."""
        body = response.body
        body = body.decode("utf-8")
        selector = Selector(text=body)
        head_url = selector.xpath('//img[@alt="头像"]//@src').get()
        information_item = InformationItem()
        information_item["head"] = head_url
        information_item['crawl_time'] = int(time.time())
        selector = Selector(response)
        information_item['_id'] = response.meta['id']
        # Join all text() found inside the profile blocks
        text1 = ";".join(selector.xpath('body/div[@class="c"]//text()').extract())
        nick_name = re.findall('昵称;?[::]?(.*?);', text1)
        gender = re.findall('性别;?[::]?(.*?);', text1)
        place = re.findall('地区;?[::]?(.*?);', text1)
        briefIntroduction = re.findall('简介;?[::]?(.*?);', text1)
        birthday = re.findall('生日;?[::]?(.*?);', text1)
        sex_orientation = re.findall('性取向;?[::]?(.*?);', text1)
        sentiment = re.findall('感情状况;?[::]?(.*?);', text1)
        vip_level = re.findall('会员等级;?[::]?(.*?);', text1)
        authentication = re.findall('认证;?[::]?(.*?);', text1)
        labels = re.findall('标签;?[::]?(.*?)更多>>', text1)
        if nick_name and nick_name[0]:
            information_item["nick_name"] = nick_name[0].replace(u"\xa0", "")
        if gender and gender[0]:
            information_item["gender"] = gender[0].replace(u"\xa0", "")
        if place and place[0]:
            place = place[0].replace(u"\xa0", "").split(" ")
            information_item["province"] = place[0]
            if len(place) > 1:
                information_item["city"] = place[1]
        if briefIntroduction and briefIntroduction[0]:
            information_item["brief_introduction"] = \
                briefIntroduction[0].replace(u"\xa0", "")
        if birthday and birthday[0]:
            information_item['birthday'] = birthday[0]
        if sex_orientation and sex_orientation[0]:
            if sex_orientation[0].replace(u"\xa0", "") == gender[0]:
                information_item["sex_orientation"] = "同性恋"
            else:
                information_item["sex_orientation"] = "异性恋"
        if sentiment and sentiment[0]:
            information_item["sentiment"] = sentiment[0].replace(u"\xa0", "")
        if vip_level and vip_level[0]:
            information_item["vip_level"] = vip_level[0].replace(u"\xa0", "")
        if authentication and authentication[0]:
            information_item["authentication"] = \
                authentication[0].replace(u"\xa0", "")
        if labels and labels[0]:
            information_item["labels"] = \
                labels[0].replace(u"\xa0", ",").replace(';', '').strip(',')
        # yield information_item
        request_meta = response.meta
        request_meta['item'] = information_item
        yield Request("https://weibo.cn/u/" + information_item['_id'],
                      callback=self.parse_further_information,
                      meta=request_meta,
                      dont_filter=True, priority=1)

    def parse_further_information(self, response):
        text = response.text
        # print(text)
        information_item = response.meta['item']
        tweets_num = re.findall('微博\[(\d+)\]', text)
        if tweets_num:
            information_item['tweets_num'] = int(tweets_num[0])
        follows_num = re.findall('关注\[(\d+)\]', text)
        if follows_num:
            information_item['follows_num'] = int(follows_num[0])
        fans_num = re.findall('粉丝\[(\d+)\]', text)
        if fans_num:
            information_item['fans_num'] = int(fans_num[0])
        yield information_item

        # # Also crawl this user's tweets
        # yield Request(url=self.base_url +
        #               '/{}/profile?page=1'.format(information_item['_id']),
        #               callback=self.parse_tweet,
        #               priority=1)
        #
        # # Crawl the followee list
        # yield Request(url=self.base_url +
        #               '/{}/follow?page=1'.format(information_item['_id']),
        #               callback=self.parse_follow,
        #               dont_filter=True)
        #
        # # Crawl the fan list
        # yield Request(url=self.base_url +
        #               '/{}/fans?page=1'.format(information_item['_id']),
        #               callback=self.parse_fans,
        #               dont_filter=True)

    def parse_tweet(self, response):
        # body = response.body
        # body = body.decode("utf-8")
        # print(body)
        if response.url.endswith('page=1'):
            # On page 1, schedule all remaining result pages at once
            all_page = re.search(r' 1/(\d+)页', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                print('获取到了页数', all_page)
                if all_page >= 99:
                    all_page = 99
                for page_num in range(2, 30):
                    page_url = response.url.replace(
                        'page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_tweet,
                                  dont_filter=True, meta=response.meta)
        # Parse the tweets on this page
        tree_node = etree.HTML(response.body)
        tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
        for tweet_node in tweet_nodes:
            try:
                tweet_item = TweetsItem()
                tweet_item['keyword'] = self.querystr
                tweet_item['crawl_time'] = []
                tweet_item['crawl_time'].append(str(int(time.time())))
                tweet_repost_url = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/@href')[0]
                user_tweet_id = re.search(
                    r'/repost/(.*?)\?uid=(\d+)', tweet_repost_url)
                tweet_item['weibo_url'] = \
                    'https://weibo.com/{}/{}'.format(user_tweet_id.group(2),
                                                     user_tweet_id.group(1))
                tweet_item['user_id'] = user_tweet_id.group(2)
                tweet_item['_id'] = '{}_{}'.format(user_tweet_id.group(2),
                                                   user_tweet_id.group(1))
                create_time_info_node = \
                    tweet_node.xpath('.//span[@class="ct"]')[-1]
                create_time_info = create_time_info_node.xpath('string(.)')
                if "来自" in create_time_info:
                    tweet_item['created_at'] = \
                        time_fix(create_time_info.split('来自')[0].strip())
                    tweet_item['tool'] = create_time_info.split('来自')[1].strip()
                else:
                    tweet_item['created_at'] = time_fix(create_time_info.strip())

                like_num = tweet_node.xpath(
                    './/a[contains(text(),"赞[")]/text()')[-1]
                tweet_item['like_num'] = []
                tweet_item['like_num'].append(
                    int(re.search('\d+', like_num).group()))
                repost_num = tweet_node.xpath(
                    './/a[contains(text(),"转发[")]/text()')[-1]
                tweet_item['repost_num'] = []
                tweet_item['repost_num'].append(
                    int(re.search('\d+', repost_num).group()))
                comment_num = tweet_node.xpath(
                    './/a[contains(text(),"评论[") '
                    'and not(contains(text(),"原文"))]/text()')[-1]
                tweet_item['comment_num'] = []
                tweet_item['comment_num'].append(
                    int(re.search('\d+', comment_num).group()))

                images = tweet_node.xpath('.//img[@alt="图片"]/@src')
                if images:
                    tweet_item['image_url'] = images[0]

                videos = tweet_node.xpath(
                    './/a[contains(@href,'
                    '"https://m.weibo.cn/s/video/show?object_id=")]'
                    '/@href')
                if videos:
                    tweet_item['video_url'] = videos[0]

                map_node = tweet_node.xpath('.//a[contains(text(),"显示地图")]')
                if map_node:
                    map_node = map_node[0]
                    map_node_url = map_node.xpath('./@href')[0]
                    map_info = re.search(r'xy=(.*?)&', map_node_url).group(1)
                    tweet_item['location_map_info'] = map_info
                    tweet_item['location'] = \
                        map_node.xpath('./preceding-sibling::a/text()')[0]

                repost_node = tweet_node.xpath(
                    './/a[contains(text(),"原文评论[")]/@href')
                if repost_node:
                    tweet_item['origin_weibo'] = repost_node[0]

                # Check whether there is a "view full text" link:
                # all_content_link = tweet_node.xpath(
                #     './/a[text()="全文" and contains(@href,"ckAll=1")]')
                # if all_content_link:
                #     all_content_url = \
                #         self.base_url + all_content_link[0].xpath('./@href')[0]
                #     yield Request(all_content_url,
                #                   callback=self.parse_all_content,
                #                   meta={'item': tweet_item},
                #                   priority=1)
                # else:
                tweet_html = etree.tostring(tweet_node, encoding='unicode')
                tweet_item['content'] = extract_weibo_content(tweet_html)
                if tweet_item['_id'] not in self.q:
                    self.q.append(tweet_item['_id'])
                    yield tweet_item

                if self.times == 0:
                    # Crawl the hot comments of this tweet
                    comment_url = 'https://weibo.cn/comment/hot/' \
                        + tweet_item['weibo_url'].split('/')[-1] + '?rl=2'
                    # print(comment_url)
                    yield Request(url=comment_url, callback=self.parse_comment,
                                  meta={'weibo_url': tweet_item['weibo_url']})
            except Exception as e:
                self.logger.error(e)

    def parse_all_content(self, response):
        # When the tweet has a "view full text" link, fetch the full text
        body = response.body
        body = body.decode("utf-8", "ignore")
        # print(body)
        response.replace(body=body)
        tree_node = etree.HTML(response.body)
        tweet_item = response.meta['item']
        content_node = tree_node.xpath('//*[@id="M_"]/div[1]')[0]
        tweet_html = etree.tostring(content_node, encoding='unicode')
        tweet_item['content'] = extract_weibo_content(tweet_html)
        self.q.append(tweet_item['_id'])
        yield tweet_item

    def parse_follow(self, response):
        """Crawl the followee list."""
        # If this is page 1, schedule all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace(
                        'page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_follow,
                                  dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" '
                              'or text()="取消关注"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/follow', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = ID
            relationships_item["followed_id"] = uid
            relationships_item["_id"] = ID + '-' + uid
            yield relationships_item

    def parse_fans(self, response):
        """Crawl the fan list."""
        # If this is page 1, schedule all remaining pages at once
        if response.url.endswith('page=1'):
            all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
            if all_page:
                all_page = all_page.group(1)
                all_page = int(all_page)
                for page_num in range(2, all_page + 1):
                    page_url = response.url.replace(
                        'page=1', 'page={}'.format(page_num))
                    yield Request(page_url, self.parse_fans,
                                  dont_filter=True, meta=response.meta)
        selector = Selector(response)
        urls = selector.xpath('//a[text()="关注他" or text()="关注她" '
                              'or text()="移除"]/@href').extract()
        uids = re.findall('uid=(\d+)', ";".join(urls), re.S)
        ID = re.findall('(\d+)/fans', response.url)[0]
        for uid in uids:
            relationships_item = RelationshipsItem()
            relationships_item['crawl_time'] = int(time.time())
            relationships_item["fan_id"] = uid
            relationships_item["followed_id"] = ID
            relationships_item["_id"] = uid + '-' + ID
            yield relationships_item

    def parse_comment(self, response):
        # If this is page 1, schedule all remaining pages at once
        # if response.url.endswith('page=1'):
        #     all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        #     if all_page:
        #         all_page = all_page.group(1)
        #         all_page = int(all_page)
        #         for page_num in range(2, all_page + 1):
        #             page_url = response.url.replace(
        #                 'page=1', 'page={}'.format(page_num))
        #             yield Request(page_url, self.parse_comment,
        #                           dont_filter=True, meta=response.meta)
        body = response.body
        body = body.decode("utf-8")
        # print(body)
        response.replace(body=body)
        # print(response.body)
        tree_node = etree.HTML(response.body)
        comment_nodes = tree_node.xpath(
            '//div[@class="c" and contains(@id,"C_")]')
        for comment_node in comment_nodes:
            comment_user_url = comment_node.xpath(
                './/a[contains(@href,"/u/")]/@href')
            if not comment_user_url:
                continue
            comment_item = CommentItem()
            comment_item['crawl_time'] = int(time.time())
            comment_item['weibo_url'] = response.meta['weibo_url']
            comment_item['comment_user_id'] = \
                re.search(r'/u/(\d+)', comment_user_url[0]).group(1)
            comment_item['content'] = extract_comment_content(
                etree.tostring(comment_node, encoding='unicode'))
            comment_item['_id'] = comment_node.xpath('./@id')[0]
            created_at_info = comment_node.xpath('.//span[@class="ct"]/text()')[0]
            like_num = comment_node.xpath(
                './/a[contains(text(),"赞[")]/text()')[-1]
            comment_item['like_num'] = int(re.search('\d+', like_num).group())
            comment_item['created_at'] = time_fix(created_at_info.split('\xa0')[0])
            people_url = 'https://weibo.cn/' \
                + comment_item['comment_user_id'] + '/info'
            yield comment_item
            yield Request(people_url, self.parse_information,
                          meta={"id": comment_item['comment_user_id']})

    def parse_head(self, response):
        body = response.body
        body = body.decode("utf-8")
        selector = Selector(text=body)
        head_url = selector.xpath('//img[@alt="头像"]//@src').get()
        item = response.meta
        item['head_url'] = head_url
        print(type(item), item)
        yield item
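# Usage sketch (an illustration, not part of the original spider): the 'q' and
# 't' values read via getattr() in start_requests() are ordinary Scrapy spider
# arguments, so a crawl can be started with
#     scrapy crawl weibo_spider -a q=中美贸易 -a t=3
# Note that -a passes strings, so the `self.times == 0` check in parse_tweet()
# only fires when an integer 0 is supplied, e.g. programmatically as below
# (assumes this module sits inside the usual Scrapy project layout so that
# get_project_settings() can locate settings.py).
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    # t=0 additionally schedules the hot-comment requests for each tweet
    process.crawl('weibo_spider', q='中美贸易', t=0)
    process.start()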
class QQNewsSpider(scrapy.Spider):
    name = 'qqnews_spider'

    def __init__(self, *args, **kwargs):
        # Parameter used when talking to BDsearchUrlUtil: identifies the site
        self.site = 'news.qq.com'
        # parse() needs the 'keyword' (a custom item attribute) associated with
        # the url. The scrapy response object does not carry it, and writing a
        # Response subclass is not worth it, so keep it as a spider attribute.
        self.keyword = ''
        self.bd = BDsearchUrlUtil()
        self.slog = SpiderLogUtil()
        super().__init__(*args, **kwargs)

    def close(self, reason):
        # When the spider stops, call clockoff() to update the database
        self.slog.spider_finish(self)
        if self.bd.clockoff(self.site, self.keyword):
            self.logger.info('qqnews_spider clock off successful')
        super().close(self, reason)

    def start_requests(self):
        # get params (from console command) when started
        self.keyword = getattr(self, 'q', None)
        if self.keyword is None:
            self.keyword = '中美贸易'

        self.slog.spider_start(self)

        # get url list from mongoDB
        urllist = self.bd.getNewUrl(self.site, self.keyword)
        # if no new url or error, urllist is None
        if urllist:
            for url in urllist:
                yield scrapy.Request(url, self.parse)

        # # test news_qq spider
        # url = 'https://news.qq.com/a/20170823/002257.htm'
        # yield scrapy.Request(url, self.parse)

    def parse(self, response):
        item = QQNewsItem()
        # Fields common to both page layouts
        item['url'] = response.url
        item['crawl_time'] = getCurrentTime()
        item['title'] = response.xpath('//div[@class=\'hd\']/h1/text()').get()
        item['keyword'] = self.keyword

        # Body extraction
        content = ''
        for paragraph in response.xpath(
                '//div[@id=\'Cnt-Main-Article-QQ\']/p/text()'):
            paragraph = paragraph.get().strip()
            paragraph = re.sub(r'<[^i].*?>', '', paragraph)
            paragraph = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', paragraph)
            content = content + paragraph
        item['content'] = content

        # A non-empty body means this is a news article; otherwise it is not.
        # The publish-time / publish-source markup has changed many times in
        # just ten years, so each layout variant needs its own XPath below.
        if content:
            # publish time
            item['time'] = self.trygetPublishTime(response)
            # publish source
            item['source'] = self.trygetPublishSource(response)
            yield item
        else:
            pass

    @staticmethod
    def trygetPublishTime(response):
        time = response.xpath('//span[@class=\'a_time\']/text()').get()
        if not time:
            time = response.xpath(
                '//div[@class=\'hd\']/div[@bosszone=\'titleDown\']'
                '//span[@class=\'article-time\']/text()').get()
        if not time:
            time = response.xpath('//div[@class=\'info\']/text()').get()
        if not time:
            time = response.xpath('//span[@class=\'pubTime\']/text()').get()

        # If a time string was found, normalize it.
        # Three raw formats were observed:
        #     2011年07月12日10:33
        #     2011年07月12日 10:33
        #     2017-08-23 06:30
        # Target format:
        #     2017-08-23-06-30-00
        if time:
            timefmt = formatTimeStr(time)
            if timefmt:
                return timefmt
            else:
                return time
        else:
            return None

    @staticmethod
    def trygetPublishSource(response):
        source = response.xpath('//span[@class=\'a_source\']/a/text()').get()
        if not source:
            source = response.xpath('//span[@class=\'a_source\']/text()').get()
        if not source:
            source = response.xpath('//span[@class=\'where\']/text()').get()
        if not source:
            source = response.xpath('//span[@class=\'where\']/a/text()').get()
        if not source:
            source = response.xpath(
                '//span[@class=\'color-a-1\']/a/text()').get()
        if not source:
            source = response.xpath(
                '//span[@class=\'color-a-1\']/text()').get()
        return source
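# The comment in trygetPublishTime() above lists three raw layouts that must all
# end up as "2017-08-23-06-30-00". The real formatTimeStr() lives in this
# project's util module; the helper below is only a hypothetical sketch of that
# normalization, handy for exercising the XPath extraction in isolation.
import time


def _format_time_str_sketch(raw):
    raw = raw.strip()
    for fmt in ("%Y年%m月%d日%H:%M", "%Y年%m月%d日 %H:%M", "%Y-%m-%d %H:%M"):
        try:
            # parse with the matching layout, then re-emit the target layout
            return time.strftime("%Y-%m-%d-%H-%M-%S", time.strptime(raw, fmt))
        except ValueError:
            continue
    return None  # mirrors trygetPublishTime(), which falls back to the raw string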
class China(Spider):
    name = "chinanews_spider"
    base_url = "http://sou.chinanews.com/search.do"
    custom_settings = {
        'DEFAULT_REQUEST_HEADERS': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
        },
        'CONCURRENT_REQUESTS': 15,
        'DOWNLOAD_DELAY': 2
    }

    def __init__(self, *args, **kwargs):
        self.slog = SpiderLogUtil()
        super().__init__(*args, **kwargs)

    def close(self, reason):
        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):
        querystr = getattr(self, 'q', None)
        if not querystr:
            querystr = '中美贸易'
        self.querystr = querystr
        self.q = []
        # folderpath = 'E:\chinanews' + querystr
        # if (not os.path.exists(folderpath)):
        #     os.mkdir(folderpath)
        self.slog.spider_start(self)
        my_data = {
            'q': querystr,
            'ps': '10',
            'start': '0',
            'type': '',
            'sort': 'pubtime',
            'time_scope': str(0),
            'channel': 'all',
            'adv': str(1),
            'day1': '',
            'day2': '',
            'field': '',
            'creator': ''
        }
        yield FormRequest(
            formdata=my_data,
            url=self.base_url,
            callback=self.parsefornum,
        )

    def parsefornum(self, response):
        body = response.body
        body = body.decode("utf-8")
        response.replace(body=body)
        # Read the total page count from the "尾页" (last page) link
        pre = re.compile(r'ongetkey\((\d+)\).?>尾页')
        num = pre.findall(str(body))
        num = int(num[0])
        print(num)
        if num > 400:
            num = 400
        for i in range(num):
            q = i * 10
            my_data = {
                'q': self.querystr,
                'ps': '10',
                'start': str(q),
                'type': '',
                'sort': 'pubtime',
                'time_scope': str(0),
                'channel': 'all',
                'adv': str(1),
                'day1': '',
                'day2': '',
                'field': '',
                'creator': ''
            }
            yield FormRequest(
                formdata=my_data,
                url=self.base_url,
                callback=self.parse,
            )

    def parse(self, response):
        body = response.body
        body = body.decode("utf-8")
        response.replace(body=body)
        for div in response.xpath('//td/ul/li[@class="news_title"]/a/@href'):
            url = div.extract()
            yield Request(url=url, callback=self.parse2)

    def parse2(self, response):
        body = response.body
        body = body.decode("utf-8", "ignore")
        response.replace(body=body)
        item = ChinaNewsItem()
        title = response.xpath('//div[@class="content"]/h1/text()').get()
        if title:
            title = title.strip()
        imgs = []
        content = ''
        ire = re.compile(r'src=\"(.+?)\"')
        pre = re.compile(r'<img[\s\S]+?>')
        url = response.url
        for p in response.xpath('//div[@class="left_zw"]').extract():
            p = re.sub(r'<[^i].*?>', '', p)
            p = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', p)
            q = pre.findall(p)
            for i in q:
                imgs.append(ire.findall(i)[0])
                p = p.replace(
                    i,
                    '&&此处有图片,url:' + imgs[-1] + ",存储名为:"
                    + (url.split('/')[-1]) + imgs[-1].split('/')[-1] + '&&')
            content = content + p.strip()
        timeandsource = response.xpath(
            '//div[@class="left-t"]/text()').get().strip()
        ts = timeandsource.split('来源')
        item['crawl_time'] = str(int(time.time()))
        created_time = ts[0].strip()
        timeArray = time.strptime(created_time, "%Y年%m月%d日 %H:%M")
        otherStyleTime = time.strftime("%Y-%m-%d-%H-%M-%S", timeArray)
        item['source'] = '中国新闻'
        if len(ts) > 1:
            source = ts[1]
            item['source'] = source
        item['keyword'] = self.querystr
        item['title'] = title
        item['content'] = content.replace("\r", "").replace("\n", "")
        item['time'] = otherStyleTime
        item['url'] = url
        item['imgs'] = imgs
        yield item


# if __name__ == "__main__":
#     process = CrawlerProcess(get_project_settings())
#     process.crawl('chinanews_spider')
#     process.start()
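# Sanity-check sketch (illustration only, not used by the spider): the form
# fields mirror the my_data dict built in start_requests(), so the search
# endpoint can be poked outside Scrapy. The 'requests' dependency is an
# assumption of this snippet, not of the project.
def preview_chinanews_search(keyword, start=0):
    import requests  # assumed to be available in the dev environment
    data = {
        'q': keyword, 'ps': '10', 'start': str(start), 'type': '',
        'sort': 'pubtime', 'time_scope': '0', 'channel': 'all', 'adv': '1',
        'day1': '', 'day2': '', 'field': '', 'creator': '',
    }
    # same POST that FormRequest issues, one result page (10 hits) at a time
    return requests.post("http://sou.chinanews.com/search.do", data=data).text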
class BaiduSearchSpider(scrapy.Spider):
    name = 'baidu_search_spider'

    def __init__(self, *args, **kwargs):
        self.slog = SpiderLogUtil()
        super().__init__(*args, **kwargs)

    def close(self, reason):
        self.slog.spider_finish(self)
        super().close(self, reason)

    def start_requests(self):
        # get params (from console command) when started
        querystr = getattr(self, 'q', None)
        site = getattr(self, 'site', None)
        if querystr is None:
            querystr = '中美贸易'
        if site is None:
            site = 'news.qq.com'

        self.slog.spider_start(self)

        url = self.baidusearchurlGen(querystr, site, 0)
        yield scrapy.Request(url, self.parse)

    def parse(self, response):
        # if no result, quit spider
        if response.xpath('//div[@class=\'content_none\']'):
            return

        # ===get info from every result===
        for oneresult in response.xpath('//div[@class=\'result c-container \']'
                                        '/h3/a/@href'):
            item = BaiduSearchItem()
            item['url'] = oneresult.get()
            item['crawl_time'] = getCurrentTime()
            item['site'] = self.getOrigSiteUrl(response.url)
            item['waste'] = False
            item['keyword'] = self.getOrigKeyword(response.url)
            yield item

        # ===crawl next page, if it exists===
        # pn = page number
        currentpn = response.xpath('//div[@id=\'page\']'
                                   '/strong/span[@class=\'pc\']/text()')
        if currentpn:
            currentpn = int(currentpn[0].get())
        maxpn = response.xpath('//div[@id=\'page\']/a'
                               '/span[@class=\'pc\']/text()')
        if maxpn:
            maxpn = int(maxpn[-1].get())
        nextpn = None
        # there is a next page iff some page number is bigger than the current one
        if (currentpn and maxpn and (maxpn > currentpn)):
            nextpn = currentpn + 1
        if nextpn:
            # get the '...&pn=' sub string
            pncharindex = re.search('&pn=', response.url).span()[1]
            nexturl = response.url[:pncharindex] + str((nextpn - 1) * 10)
            yield response.follow(nexturl, self.parse)

    '''
    Build a Baidu search url to use as a seed url for the spider.
    exm: http://www.baidu.com/s?wd="中美贸易" site%3Anews.qq.com&pn=0
    Original query: ("中美贸易" site:news.qq.com)
    pn: page number, i.e. the rank of the first result on this page.
        pn = (result page number - 1) * rn
        The rn parameter (results per page) no longer behaves reliably,
        so the default of ten results per page is assumed.
    @ param {string} querystr   query string, exm: 中美贸易
    @ param {string} site       exm: news.qq.com; uses Baidu's site: operator
                                to search within one site
    @ param {string} pagenumber rank of the first result on this page, = pn
    @ return {string} exm: https://www.baidu.com/s?
                           wd="中美贸易" site%3Anews.qq.com&pn=0
    '''
    @staticmethod
    def baidusearchurlGen(querystr, site, pagenumber):
        # Note: the https endpoint has an anti-crawler mechanism that loads the
        # real data via script, so only an empty shell page can be crawled there.
        return "http://www.baidu.com/s?wd=\"" \
               + querystr + "\" site:" + site + "&pn=" + str(pagenumber)

    '''
    Get the site domain the search was restricted to. exm: news.qq.com
    @ param {string} resurl response.url
        exm: https://www.baidu.com/s?wd="中美贸易" site%3Anews.qq.com&pn=0
    @ return {string}
    '''
    @staticmethod
    def getOrigSiteUrl(resurl):
        # 'site%3A{site domain}&pn'
        index = re.search('site:.*&pn', resurl).span()
        # '{site domain}'
        return resurl[(index[0] + 5):(index[1] - 3)]

    '''
    Get the keyword the search was made with. exm: '中美贸易'
    @ param {string} resurl response.url
        exm: https://www.baidu.com/s?wd=中美贸易 site%3Anews.qq.com&pn=0
    @ return {string}
    '''
    @staticmethod
    def getOrigKeyword(resurl):
        # 's?wd="{keyword}" site'
        index = re.search('wd=%22.*%22%20', resurl).span()
        # '{keyword}' is url-encoded ascii, so decode it
        return urllib.parse.unquote(resurl[(index[0] + 6):(index[1] - 6)])
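# Quick self-check (not in the original file): baidusearchurlGen() simply
# splices the quoted keyword, the site: filter and the pn offset together, and
# pn advances by 10 per result page, matching the nexturl arithmetic in parse().
if __name__ == "__main__":
    for page in range(3):
        print(BaiduSearchSpider.baidusearchurlGen('中美贸易', 'news.qq.com', page * 10))
    # http://www.baidu.com/s?wd="中美贸易" site:news.qq.com&pn=0
    # http://www.baidu.com/s?wd="中美贸易" site:news.qq.com&pn=10
    # http://www.baidu.com/s?wd="中美贸易" site:news.qq.com&pn=20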
class SinaNewsSpider(scrapy.Spider):
    name = 'sinanews_spider'

    def __init__(self, *args, **kwargs):
        # Parameter used when talking to BDsearchUrlUtil: identifies the site
        self.site = 'news.sina.com.cn'
        # parse() needs the 'keyword' (a custom item attribute) associated with
        # the url. The scrapy response object does not carry it, and writing a
        # Response subclass is not worth it, so keep it as a spider attribute.
        self.keyword = ''
        self.bd = BDsearchUrlUtil()
        self.slog = SpiderLogUtil()
        super().__init__(*args, **kwargs)

    def close(self, reason):
        self.slog.spider_finish(self)
        # When the spider stops, call clockoff() to update the database
        if self.bd.clockoff(self.site, self.keyword):
            self.logger.info('sinanews_spider clock off successful')
        super().close(self, reason)

    def start_requests(self):
        # get params (from console command) when started
        self.keyword = getattr(self, 'q', None)
        if self.keyword is None:
            self.keyword = '中美贸易'

        self.slog.spider_start(self)

        # get url list from mongoDB
        urllist = self.bd.getNewUrl(self.site, self.keyword)
        # if no new url or error, urllist is None
        if urllist:
            for url in urllist:
                yield scrapy.Request(url, self.parse)

        # # test spider
        # url = 'http://news.sina.com.cn/c/2019-06-24' \
        #       '/doc-ihytcitk7355640.shtml'
        # yield scrapy.Request(url, self.parse)

    def parse(self, response):
        item = SinaNewsItem()
        item['url'] = response.url
        item['crawl_time'] = getCurrentTime()
        item['keyword'] = self.keyword

        title = response.xpath('//title/text()').get()
        if title:
            title = title.replace('_新浪新闻', '')
            title = title.replace('_新浪网', '')
            title = title.replace('_新浪军事', '')
            title = title.replace('_新闻中心', '')
        item['title'] = title

        item['time'] = self.trygetPublishTime(response)
        item['source'] = self.trygetPublishSource(response)
        # Body extraction
        item['content'] = self.trygetContent(response)
        yield item

    @staticmethod
    def trygetPublishTime(response):
        time = response.xpath('//div[@class=\'date-source\']'
                              '/span[@class=\'date\']/text()').get()
        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        # http://mil.news.sina.com.cn/china/2016-04-14/
        #     doc-ifxriqqx2384948.shtml
        if not time:
            if response.xpath('//span[@class=\'time-source\']'
                              '//span[@class=\'titer\']'):
                time = response.xpath('//span[@class=\'time-source\']'
                                      '//span[@class=\'titer\']/text()').get()
            else:
                time = response.xpath(
                    '//span[@class=\'time-source\']/text()').get()
            if time:
                time = re.sub(r'<[^i].*?>', '', time)

        if time:
            timefmt = formatTimeStr(time)
            if timefmt:
                return timefmt
            else:
                return time
        else:
            return None

    @staticmethod
    def trygetPublishSource(response):
        source = response.xpath('//div[@class=\'date-source\']'
                                '/a[@class=\'source\']/text()').get()
        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        if not source:
            source = response.xpath('//div[@class=\'time-source\']'
                                    '//a/text()').get()
        # http://mil.news.sina.com.cn/china/
        #     2016-04-14/doc-ifxriqqx2384948.shtml
        if not source:
            source = response.xpath('//span[@class=\'time-source\']'
                                    '//span[@class=\'source\']'
                                    '/text()').get()
        return source

    @staticmethod
    def trygetContent(response):
        content = ''

        def paragraph_process(paragraph):
            p = paragraph.get().strip()
            p = re.sub(r'<[^i].*?>', '', p)
            p = re.sub(r'\(function[\s\S]+?\}\)\(\);', '', p)
            return p

        # /a/ /c/ doc-... /o/
        if response.xpath('//div[@id=\'article\']//p/text()'):
            for paragraph in response.xpath(
                    '//div[@id=\'article\']//p/text()'):
                content = content + paragraph_process(paragraph)
        # some of /o/
        # http://news.sina.com.cn/o/2019-05-14/doc-ihvhiews1782968.shtml
        elif response.xpath('//div[@id=\'article\']//div/text()'):
            for paragraph in response.xpath(
                    '//div[@id=\'article\']//div/text()'):
                content = content + paragraph_process(paragraph)
        # http://news.sina.com.cn/o/2017-07-07/doc-ifyhvyie0474852.shtml
        elif response.xpath('//div[@id=\'artibody\']//p/text()'):
            for paragraph in response.xpath(
                    '//div[@id=\'artibody\']//p/text()'):
                content = content + paragraph_process(paragraph)
        return content
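# Usage sketch (not part of the original file): like qqnews_spider, this spider
# takes its URL list from BDsearchUrlUtil rather than crawling search pages
# itself, so baidu_search_spider presumably has to run first for the same
# keyword so that getNewUrl() has URLs to return. After that:
#     scrapy crawl sinanews_spider -a q=中美贸易
# or programmatically (assumes the usual Scrapy project layout):
if __name__ == "__main__":
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl('sinanews_spider', q='中美贸易')
    process.start()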