def parse_post(self, response):
    """Parse one page of a thread: yield a PostItem per floor, plus
    follow-up requests for images, author profiles, comments, and the
    next page.

    Expects ``response.meta`` to carry ``thread_id`` and ``page``.
    """
    meta = response.meta
    has_comment = False
    for floor in response.xpath("//div[contains(@class, 'l_post')]"):
        if not helper.is_ad(floor):
            data = json.loads(floor.xpath("@data-field").extract_first())
            item = PostItem()
            item['post_id'] = data['content']['post_id']
            item['author'] = data['author']['user_name']
            item['comment_num'] = data['content']['comment_num']
            if item['comment_num'] > 0:
                has_comment = True
            content = floor.xpath(
                ".//div[contains(@class,'j_d_post_content')]"
            ).extract_first()
            # Older posts have no 'content' in data-field; parse the HTML node.
            item['content'] = helper.parse_content(content, True)
            images = helper.get_images(content, True)
            # enumerate() instead of images.index(image): index() rescans the
            # list on every iteration (O(n^2)) and returns the FIRST match,
            # which assigns the wrong index when the same image URL repeats.
            for image_index, image in enumerate(images):
                yield self.parse_image(image_url=image,
                                       post_id=item['post_id'],
                                       image_index=image_index)
            # Older posts have no 'thread_id' in data-field; take it from meta.
            item['thread_id'] = meta['thread_id']
            item['floor'] = data['content']['post_no']
            # dict.get defaults to None, matching the original if/else.
            item['user_id'] = data['author'].get('user_id')
            # Only older posts carry 'date' inside data-field.
            if 'date' in data['content']:
                item['time'] = data['content']['date']
            else:
                # Fall back to scraping the timestamp from the tail-info span.
                item['time'] = floor.xpath(".//span[@class='tail-info']")\
                    .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
            user_uri = floor.xpath(
                ".//a[@class='p_author_name j_user_card']/@href"
            ).extract_first()
            if user_uri:
                url = 'http://tieba.baidu.com%s' % user_uri
                yield scrapy.Request(url, callback=self.parse_user)
            yield item
    if has_comment:
        url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
            meta['thread_id'], meta['page'])
        if self.see_lz:
            url += '&see_lz=1'
        yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
    next_page = response.xpath(
        u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
    if next_page:
        meta['page'] += 1
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, callback=self.parse_post, meta=meta)
def parse_post(self, response):
    """Parse one page of a thread: yield a PostItem per floor, then
    schedule the total-comment request and the next page.

    Expects ``response.meta`` to carry ``thread_id`` and ``page``.
    """
    meta = response.meta
    has_comment = False
    # The same desktop UA literal was duplicated verbatim on both requests
    # below; keep a single copy so the two cannot drift apart.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/79.0.3945.130 Safari/537.36')
    for floor in response.xpath("//div[contains(@class, 'l_post')]"):
        if not helper.is_ad(floor):
            data = json.loads(floor.xpath("@data-field").extract_first())
            item = PostItem()
            item['id'] = data['content']['post_id']
            item['author'] = data['author']['user_name']
            item['comment_num'] = data['content']['comment_num']
            if item['comment_num'] > 0:
                has_comment = True
            content = floor.xpath(
                ".//div[contains(@class,'j_d_post_content')]"
            ).extract_first()
            # Older posts have no 'content' in data-field; parse the HTML node.
            item['content'] = helper.parse_content(content)
            # Older posts have no 'thread_id' in data-field; take it from meta.
            item['thread_id'] = meta['thread_id']
            item['floor'] = data['content']['post_no']
            # Only older posts carry 'date' inside data-field.
            if 'date' in data['content']:
                item['time'] = data['content']['date']
            else:
                # Fall back to scraping the timestamp from the tail-info span.
                item['time'] = floor.xpath(".//span[@class='tail-info']")\
                    .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
            yield item
    if has_comment:
        url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
            meta['thread_id'], meta['page'])
        if self.see_lz:
            url += '&see_lz=1'
        yield scrapy.Request(url,
                             callback=self.parse_comment,
                             meta=meta,
                             headers={'User-Agent': user_agent})
    next_page = response.xpath(
        u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
    if next_page:
        meta['page'] += 1
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url,
                             callback=self.parse_post,
                             meta=meta,
                             headers={'User-Agent': user_agent})
def parse_post(self, response):
    """Walk the floors of a thread page, yielding one PostItem each,
    then queue the total-comment lookup and the next page request."""
    meta = response.meta
    found_comments = False
    for floor in response.xpath("//div[contains(@class, 'l_post')]"):
        # Skip advertisement floors outright.
        if helper.is_ad(floor):
            continue
        field = json.loads(floor.xpath("@data-field").extract_first())
        post = PostItem()
        post['id'] = field['content']['post_id']
        post['author'] = field['author']['user_name']
        post['comment_num'] = field['content']['comment_num']
        found_comments = found_comments or post['comment_num'] > 0
        raw_html = floor.xpath(
            ".//div[contains(@class,'j_d_post_content')]"
        ).extract_first()
        # Older posts carry no 'content' in data-field, so parse the node.
        post['content'] = helper.parse_content(raw_html)
        # Older posts carry no 'thread_id' either; meta supplies it.
        post['thread_id'] = meta['thread_id']
        post['floor'] = field['content']['post_no']
        # Only older posts have 'date' inside data-field.
        if 'date' in field['content']:
            post['time'] = field['content']['date']
        else:
            # Otherwise scrape the timestamp out of the tail-info span.
            tail = floor.xpath(".//span[@class='tail-info']")
            post['time'] = tail.re_first(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}')
        yield post
    if found_comments:
        comment_url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
            meta['thread_id'], meta['page'])
        if self.see_lz:
            comment_url += '&see_lz=1'
        yield scrapy.Request(comment_url,
                             callback=self.parse_totalComment,
                             meta=meta,
                             headers=self.my_headers)
    next_href = response.xpath(
        u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
    if next_href:
        meta['page'] += 1
        yield scrapy.Request(response.urljoin(next_href.extract_first()),
                             callback=self.parse_post,
                             meta=meta,
                             headers=self.my_headers)
def parse(self, response):
    """Parse one page of the thread list, accumulate PostItems in
    ``self.data``, and page forward in steps of 50 until index 1000,
    at which point the collected items are persisted.

    NOTE(review): assumes ``self.extractInfo`` raises IndexError when the
    extracted node list is empty — TODO confirm against its definition.
    """
    for sel in response.xpath("//li[contains(@class,'j_thread_list')]"):
        item = PostItem()
        try:
            item['title'] = self.extractInfo(sel.xpath(".//div[contains(@class,'threadlist_abs')]/text()").extract())
            item['date'] = self.extractInfo(sel.xpath(".//span[contains(@class,'is_show_create_time')]/text()").extract())
            item['url'] = self.extractInfo(sel.xpath(".//div[contains(@class,'threadlist_title pull_left j_th_tit')]").xpath("a/@href").extract())
            item['author'] = self.extractInfo(sel.xpath(".//span[contains(@class,'tb_icon_author')]/@title").extract())
        except IndexError:
            # Skip list entries missing any expected node (e.g. ads/pinned rows).
            continue
        print(item['title'], item['date'], item['url'], item['author'])
        self.data.append(item)
    self.currentIndex += 50
    if self.currentIndex > 1000:
        self.saveToDb(self.data)
        return
    nexturl = 'https://tieba.baidu.com/f?kw=clannad&ie=utf-8&pn=' + str(self.currentIndex)
    # make_requests_from_url() is deprecated since Scrapy 1.4 and removed in
    # 2.0; build the Request directly. dont_filter=True matches the old
    # default behaviour of that helper.
    yield scrapy.Request(nexturl, callback=self.parse, dont_filter=True)
def parse_post(self, response):
    """Parse one page of a thread: yield a PostItem per non-ad floor,
    follow each author's profile page, then schedule the total-comment
    request and the next page of the thread.

    Expects ``response.meta`` to carry ``tiebaInfoId``, ``threadId`` and
    ``page``.
    """
    meta = response.meta
    has_comment = False
    # total_commont_floor_num = 0
    # Running total across recursive calls; moved to another service for aggregation.
    # if meta.has_key('total_commont_floor_num'):
    #     print("已经有了统计的总数:{}".format(meta['total_commont_floor_num']))
    #     total_commont_floor_num = meta['total_commont_floor_num']
    # VIP users have vip_red in the class attribute, hence contains().
    floor_list = response.xpath("//div[contains(@class, 'l_post')]")
    # post_floor_userinfo_list = response.xpath('//a[contains(@class, "p_author_name")]//@href')
    # print("楼层用户的集合的长度:{}".format(len(post_floor_userinfo_list)))
    for index, floor in enumerate(floor_list):
        if not helper.is_ad(floor):
            # total_commont_floor_num +=1
            data = json.loads(floor.xpath("@data-field").extract_first())
            item = PostItem()
            # Floor author's user name (currently unused):
            # item['author'] = data['author']['user_name']
            # Floor author's user id.
            item['tiebaAccountId'] = data['author']['user_id']
            # Per-floor comment count is handled below to drive sub-comment scraping.
            content = floor.xpath(
                ".//div[contains(@class,'j_d_post_content')]"
            ).extract_first()
            # Older posts have no 'content' in data-field; parse the HTML node.
            item['content'] = helper.parse_content(content)
            # Floor (post) id.
            item['outContentId'] = data['content']['post_id']
            # Tieba (forum) id, carried in from the caller via meta.
            item['tiebaInfoId'] = meta['tiebaInfoId']
            # TODO: the user's profile URL could be scraped directly.
            # print("获取:{}".format(user_detail_href))
            # item['user_detail_href'] = user_detail_href
            # Older posts have no 'threadId' in data-field; take it from meta.
            item['threadId'] = meta['threadId']
            # Floor number (currently unused):
            # item['floor'] = data['content']['post_no']
            post_time = None
            # Only older posts carry 'date' inside data-field.
            if 'date' in data['content'].keys():
                post_time = data['content']['date']
                item['publishTime'] = post_time
            else:
                # Otherwise scrape the timestamp from the tail-info span.
                post_time = floor.xpath(".//span[@class='tail-info']")\
                    .re_first(r'[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}')
                item['publishTime'] = post_time
            # Normalize the time format.
            # NOTE(review): assumes dealTime() always returns a
            # "%Y-%m-%d %H:%M:%S" string — strptime below raises otherwise;
            # confirm against dealTime's definition.
            post_time = self.dealTime(post_time)
            created_at = datetime.strptime(post_time, "%Y-%m-%d %H:%M:%S")
            since_date = datetime.strptime(self.since_date, "%Y-%m-%d %H:%M:%S")
            # Mark items older than the configured cutoff as not-to-send.
            if created_at < since_date:
                item['isSend'] = False
            else:
                item['isSend'] = True
            # Handle comments: remember whether any floor on this page has them.
            item['comment_num'] = data['content']['comment_num']
            if item['comment_num'] > 0:
                has_comment = True
            yield item
            user_detail_href = "https://tieba.baidu.com/home/main?id=" + data[
                'author']['portrait']
            # Request the user's profile page; the item's fields are passed
            # through as keyword arguments via cb_kwargs.
            yield scrapy.Request(user_detail_href,
                                 callback=self.parse_user_detail,
                                 meta=meta,
                                 cb_kwargs=dict(item))
    # meta['total_commont_floor_num'] = total_commont_floor_num
    if has_comment:
        url = "http://tieba.baidu.com/p/totalComment?tid=%d&fid=1&pn=%d" % (
            meta['threadId'], meta['page'])
        if self.see_lz:
            url += '&see_lz=1'
        yield scrapy.Request(url, callback=self.parse_comment, meta=meta)
    next_page = response.xpath(
        u".//ul[@class='l_posts_num']//a[text()='下一页']/@href")
    if next_page:
        meta['page'] += 1
        url = response.urljoin(next_page.extract_first())
        yield scrapy.Request(url, callback=self.parse_post, meta=meta)