def parse(self, response):  # forum parser
    """Parse one forum list page.

    Yields a ThreadItem per listed thread, schedules a request for each
    thread's post page (parse_post), then follows the "next page" link
    until self.end_page is reached.
    """
    for node in response.xpath('//li[contains(@class, "j_thread_list")]'):
        data = json.loads(node.xpath('@data-field').extract_first())
        item = ThreadItem()
        item['id'] = data['id']
        item['author'] = data['author_name']
        item['reply_num'] = data['reply_num']
        item['good'] = data['is_good']
        # Normalize a falsy "is_good" marker (None/0/"") to a plain False.
        if not item['good']:
            item['good'] = False
        item['title'] = node.xpath(
            './/div[contains(@class, "threadlist_title")]/a/text()'
        ).extract_first()
        # Threads rejected by the user-supplied filter are skipped entirely:
        # neither the thread nor its replies reach the database.
        if self.filter and not self.filter(
                item["id"], item["title"], item['author'],
                item['reply_num'], item['good']):
            continue
        yield item
        url = 'http://tieba.baidu.com/p/%d' % data['id']
        if self.see_lz:
            url += '?see_lz=1'  # restrict to the thread starter's posts
        yield scrapy.Request(
            url,
            callback=self.parse_post,
            meta={'thread_id': data['id'], 'page': 1})
    # Pagination: advance the page counter and follow the next link while
    # we are still within the configured page range.
    next_page = response.xpath('//a[@class="next pagination-item "]/@href')
    self.cur_page += 1
    if next_page and self.cur_page <= self.end_page:
        yield self.make_requests_from_url('http:' + next_page.extract_first())
def parse(self, response):  # forum parser
    """Parse a forum list page, logging progress as we go.

    Emits one ThreadItem per thread plus a follow-up request for the
    thread's posts (parse_post); paginates until self.end_page.
    """
    print("Crawling page %d..." % self.cur_page)
    for entry in response.xpath('//li[contains(@class, "j_thread_list")]'):
        data = json.loads(entry.xpath('@data-field').extract_first())
        item = ThreadItem()
        item['id'] = data['id']
        item['author'] = data['author_name']
        item['reply_num'] = data['reply_num']
        item['good'] = data['is_good']
        # Coerce a falsy is_good flag to an explicit False.
        if not item['good']:
            item['good'] = False
        item['title'] = entry.xpath(
            './/div[contains(@class, "threadlist_title")]/a/@title'
        ).extract_first()
        # Threads filtered out are dropped together with their replies.
        if self.filter and not self.filter(
                item["id"], item["title"], item['author'],
                item['reply_num'], item['good']):
            continue
        yield item
        url = 'http://tieba.baidu.com/p/%d' % data['id']
        if self.see_lz:
            url += '?see_lz=1'  # only the thread starter's floors
        # Explicit desktop User-Agent — presumably to get the desktop
        # page layout this spider scrapes; confirm against the target site.
        yield scrapy.Request(
            url,
            callback=self.parse_post,
            meta={'thread_id': data['id'], 'page': 1},
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36'
            })
    # Follow pagination while within the configured page range.
    next_page = response.xpath('//a[@class="next pagination-item "]/@href')
    self.cur_page += 1
    if next_page and self.cur_page <= self.end_page:
        yield self.make_requests_from_url('http:' + next_page.extract_first())
def parse(self, response):  # forum parser
    """Parse a thread-list page (unfiltered variant).

    Yields one ThreadItem per thread and a request for its posts;
    follows pagination while the max_page countdown allows.
    """
    for thread in response.xpath('//li[contains(@class, "j_thread_list")]'):
        data = json.loads(thread.xpath('@data-field').extract_first())
        item = ThreadItem()
        item['id'] = data['id']
        item['author'] = data['author_name']
        item['reply_num'] = data['reply_num']
        item['good'] = data['is_good']
        if not item['good']:
            # Normalize falsy marker to a plain False.
            item['good'] = False
        item['title'] = thread.xpath(
            './/div[contains(@class, "threadlist_title")]/a/text()'
        ).extract_first()
        yield item
        yield scrapy.Request(
            'http://tieba.baidu.com/p/%d' % data['id'],
            callback=self.parse_post,
            meta={'thread_id': data['id'], 'page': 1})
    # Countdown-style pagination: decrement max_page once per crawled
    # page and stop following links when it runs out.
    next_page = response.xpath('//a[@class="next pagination-item "]/@href')
    if next_page:
        self.max_page -= 1
        if self.max_page > 0:
            yield self.make_requests_from_url(next_page.extract_first())
def parse(self, response):  # forum parser
    """Parse a tieba forum page.

    Yields a TiebaInfo item describing the forum itself, then one
    request per listed thread (handled by parse_thread_time, which
    receives the partially-filled ThreadItem via cb_kwargs), and
    finally follows the "next page" link until self.end_page.
    """
    pre_titleName = response.xpath('//head/title//text()').extract_first()
    print("head中的贴吧名:", pre_titleName)
    # The forum name doubles as the forum id; both are the prefix of the
    # <title> text before the first '-'.
    titleName = pre_titleName.split('-')[0]
    titleId = pre_titleName.split('-')[0]
    # follower count
    card_menNum = response.xpath(
        '//span[contains(@class, "card_numLabel")]')
    # post count
    card_infoNum = response.xpath(
        '//span[contains(@class, "card_infoNum")]')
    print("关注数:{}".format(card_menNum))
    print("发帖数:{}".format(card_infoNum))
    tiebaInfo = TiebaInfo()
    # Forum id: detail links currently all use fw=<forum name>.
    tiebaInfo['outId'] = titleId
    tiebaInfo['tiebaName'] = titleName
    # NOTE(review): these two fields store SelectorLists, not extracted
    # numbers — confirm the downstream pipeline expects that; otherwise
    # the text needs to be extracted here.
    tiebaInfo['accountCount'] = card_menNum
    tiebaInfo['postCount'] = card_infoNum
    manager_groups = response.xpath(
        '//ul[contains(@class, "manager_groups aside_media_horizontal")]')
    print("管理员列表:{}".format(manager_groups))
    noreferrer_name_list = manager_groups.xpath(
        '//a[@rel="noreferrer"]/@title').extract()
    print("吧主名字集合数据:{}".format(noreferrer_name_list))
    # For now only the moderators' display names are kept.
    tiebaInfo["managerIds"] = ",".join(noreferrer_name_list)
    yield tiebaInfo

    thread_list = response.xpath('//li[contains(@class, "j_thread_list")]')
    thread_author_list = response.xpath(
        '//span[contains(@class, "tb_icon_author")]')
    # Hoisted out of the loop: extract all author data-fields once
    # instead of re-extracting the whole list on every iteration.
    author_fields = thread_author_list.xpath('@data-field').extract()
    # NOTE: the creation time shown in the list has a problematic format;
    # it is resolved later in parse_thread_time from the first floor.
    for index, sel in enumerate(thread_list):
        data = json.loads(sel.xpath('@data-field').extract_first())
        user_data = author_fields[index]
        print("用户id信息:{}".format(user_data))
        print("用户名字:{}".format(data['author_name']))
        print("用户id的data类型:{}".format(type(user_data)))
        author_data = json.loads(user_data)
        item = ThreadItem()
        item['tiebaAccountId'] = author_data['user_id']  # poster's account id
        item['outContentId'] = data['id']  # external content id (thread id)
        item['tiebaInfoId'] = titleName    # forum name == forum id
        print("用户详情页跳转标识:{}".format(data['author_portrait']))
        # Thread title serves as the content snippet.
        item['content'] = sel.xpath(
            './/div[contains(@class, "threadlist_title")]/a/@title'
        ).extract_first()
        # BUGFIX: the filter used to be called with item keys ("id",
        # "title", "author", ...) that this version of the item never
        # sets, raising KeyError whenever a filter was configured; pass
        # the equivalent raw values from data/item instead. Threads the
        # filter rejects are skipped entirely, replies included.
        if self.filter and not self.filter(
                data['id'], item['content'], data['author_name'],
                data['reply_num'], data['is_good']):
            continue
        # BUGFIX: used to print the builtin `filter`, not the spider's.
        print("filter的值:{}".format(self.filter))
        url = 'http://tieba.baidu.com/p/%d' % data['id']
        if self.see_lz:
            url += '?see_lz=1'  # only the thread starter's posts
        # parse_thread_time resolves the publish time and yields the item;
        # the item's fields travel as keyword arguments.
        yield scrapy.Request(url, callback=self.parse_thread_time,
                             cb_kwargs=dict(item))
    # Follow pagination while within the configured page range.
    next_page = response.xpath('//a[@class="next pagination-item "]/@href')
    self.cur_page += 1
    if next_page and self.cur_page <= self.end_page:
        yield self.make_requests_from_url('http:' + next_page.extract_first())
def parse_thread_time(self, response, tiebaAccountId, outContentId,
                      tiebaInfoId, content):
    """Resolve a thread's publish time from its first floor and yield
    the completed ThreadItem; optionally schedule the floor crawl.

    The keyword arguments arrive via cb_kwargs from parse(). The first
    'l_post' div on the detail page is the thread itself: newer pages
    omit 'date' from its data-field JSON, in which case the time is
    scraped from the 'tail-info' span instead.
    """
    print(
        "1楼的传递参数: tiebaAccountId:{},outContentId:{} tiebaInfoId:{}".format(
            tiebaAccountId, outContentId, tiebaInfoId))
    item = ThreadItem()
    item['tiebaAccountId'] = tiebaAccountId  # poster's account id
    item['outContentId'] = outContentId      # external content id (thread id)
    item['tiebaInfoId'] = tiebaInfoId        # forum name == forum id
    item['content'] = content
    # BUGFIX: first_floor used to be extract_first()'d into a plain str,
    # so the .xpath() call in the else-branch below raised
    # AttributeError for any thread without 'date' in its data-field.
    # Keep it as a selector and take the data-field from it.
    first_floor = response.xpath("//div[contains(@class, 'l_post')]")
    first_data_field = first_floor.xpath("@data-field").extract_first()
    print("first_data_field的信息:{}".format(first_data_field))
    first_floor_data = json.loads(first_data_field)
    if 'date' in first_floor_data['content']:
        # Only older threads still carry the date inside data-field.
        thread_time = first_floor_data['content']['date']
    else:
        thread_time = first_floor.xpath(".//span[@class='tail-info']") \
            .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
    item['publishTime'] = thread_time
    print("时间:{}".format(thread_time))
    # Normalize the scraped time, then compare against the cutoff date.
    thread_time = self.dealTime(thread_time)
    created_at = datetime.strptime(thread_time, "%Y-%m-%d %H:%M:%S")
    since_date = datetime.strptime(self.since_date, "%Y-%m-%d %H:%M:%S")
    send_Flag = created_at < since_date  # True: older than the cutoff
    item['isSend'] = not send_Flag
    yield item
    # NOTE(review): the original comment said floors need not be fetched
    # when the thread will not be sent, yet the request below fires
    # exactly when send_Flag is True — confirm whether this condition
    # should be `if not send_Flag:` before changing it.
    if send_Flag:
        meta = {
            'threadId': outContentId,
            'tiebaInfoId': tiebaInfoId,
            'page': 1,
            self.const_active_tieba: {tiebaInfoId: 0}
        }
        print("跳转帖子对应的楼层详情页URL:{}".format(response.url))
        yield scrapy.Request(response.url, callback=self.parse_post, meta=meta)