def parse_content(self, response):
    """Parse a niumowang (牛摩网) thread page and yield one item per comment.

    Bug fix: the original built ``title + "评论"`` unconditionally, so a page
    whose ``<h1>`` is missing (title is None) raised TypeError and the whole
    page was silently dropped by the broad except; the title is now guarded.

    :param response: thread-page response; comments live under
        ``<ul class="comment_list">``.
    :return: generator of BaseItem
    """
    # self.info('牛摩网:[{}]、.状态:[{}]'.format(response.url, response.status))
    title_path = response.xpath('//h1')
    title = title_path.xpath("string(.)").extract_first()
    try:
        for each in response.xpath('//ul[@class="comment_list"]/li'):
            item = BaseItem()
            commenturl = response.url
            username = each.xpath('.//p/text()').extract_first()
            commentdetail = each.xpath('.//dd/text()').extract_first()
            pushtime = each.xpath('./div/div/div/text()').extract_first()
            # Guard against a missing <h1>: concatenating None crashed here.
            item['title'] = (title + "评论") if title is not None else None
            item['bbs_name'] = '牛摩论坛'
            item['sonbbs_name'] = None
            item['username'] = username
            # Anonymous/blank floors carry no useful data.
            if item['username'] is None:
                continue
            item['comment_detail'] = commentdetail
            item['comment_url'] = commenturl
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = title
            item['collection'] = "牛摩网(竞品)"  # TODO rename the target table
            item['usergender'] = None
            item['userlocation'] = None
            item['userage'] = None
            yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parse_comment(self, response):
    """Parse Tieba's ``totalComment`` JSON endpoint and yield nested replies.

    The payload maps post ids to bundles whose ``comment_info`` lists the
    floor-in-floor replies; replies older than ``self.start_time`` are
    discarded.

    :param response: JSON response; ``response.meta`` carries the parent
        thread's title, sub-forum name and URL.
    :return: generator of BaseItem
    """
    # response.encoding = 'utf8'
    # print(response.text)
    try:
        payload = json.loads(response.text)
        bundles = payload['data']['comment_list']
        for _pid, bundle in bundles.items():
            for reply in bundle["comment_info"]:
                posted = time.strftime(
                    "%Y-%m-%d %H:%M:%S", time.localtime(reply["now_time"]))
                # Outside the crawl window: skip before building the item.
                if posted < self.start_time:
                    continue
                item = BaseItem()
                item['title'] = response.meta["title"]
                item['bbs_name'] = '百度贴吧'
                item['sonbbs_name'] = response.meta["sonbbs_name"]
                item['username'] = reply["username"]
                item['comment_detail'] = reply["content"]
                item['comment_url'] = response.meta["comment_url"]
                item['push_time'] = posted
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = None
                item['collection'] = "(百度贴吧)" + "自动驾驶"
                item['usergender'] = None
                item['userlocation'] = None
                item['userage'] = None
                item["kw"] = None
                yield item
    except Exception as e:
        self.error('【无评论】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parseContent(self, response):
    """Extract one comment record per floor from a motorcycle.sh.cn thread.

    :param response: thread-page response
    :return: generator of BaseItem (floors without a visible author are
        skipped)
    """
    title = response.xpath(
        '//span[@id="thread_subject"]/text()').extract_first()
    sonbbs_name = response.xpath(
        '//div[@class="z"]/a[4]/text()').extract_first()
    try:
        floors = response.xpath('//div[@id="ct"]/div[@id="postlist"]/div')
        for floor in floors:
            # Author; anonymous or deleted floors carry no <a class="xw1">.
            author = floor.xpath(
                './/div[@class="authi"]/a[@class="xw1"]/text()'
            ).extract_first()
            if author is None:
                continue
            profile_href = floor.xpath(
                './/div[@class="authi"]/a[@class="xw1"]/@href'
            ).extract_first()
            # Fetch (gender, location, age) from the author's profile page.
            profile = self.parse_user('http://motorcycle.sh.cn/' + profile_href)
            body = floor.xpath(
                './/td[@class="t_f"]').xpath('string(.)').extract_first()
            posted = floor.xpath(
                './/div[@class="authi"]/em/text()').extract_first()
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = '摩托迷'
            item['sonbbs_name'] = sonbbs_name
            item['username'] = author
            item['comment_detail'] = body
            item['comment_url'] = response.url
            item['push_time'] = posted
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = None
            item['collection'] = "摩托迷(3.22)"
            item['usergender'] = profile[0]
            item['userlocation'] = profile[1]
            # Regional sub-forums encode the location in the forum name.
            if item['userlocation'] is None and "摩友交流区" in sonbbs_name:
                item['userlocation'] = sonbbs_name.replace("摩友交流区", '')
            item['userage'] = profile[2]
            yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parseAutoBBS(self, response):
    """Parse an autohome.com.cn thread page and yield one item per reply.

    Bug fix: the original created a single ``BaseItem`` before the loop and
    mutated/yielded that same object for every floor, so every yielded item
    shared one mutable mapping; a fresh item is now built per floor.

    :param response: thread-page response (may be a redirect away from the
        target forum)
    :return: generator of BaseItem
    """
    try:
        # TODO detect redirection; adjust the marker when crawling other forums.
        html = str(response.body)
        if "本田摩托车论坛" not in html:
            # Redirected: emit a marker item so the pipeline can record it.
            marker = BaseItem()
            marker['comment_url'] = response.url
            marker['collection'] = "test"
            yield marker
            return
        title = response.xpath(
            '//div[@id="consnav"]/span[4]/text()').extract_first()
        bbsname = response.xpath(
            '//div[@id="consnav"]/span[2]/a/text()').extract_first()
        for each in response.xpath(
                '//div[@id="maxwrap-reply"]/div[@class="clearfix contstxt outer-section"]'
        ):
            username = each.xpath('.//li[@class="txtcenter fw"]/a/text()'
                                  ).extract_first().strip()
            uid = each.xpath('./@uid').extract_first()
            userurl = "https://i.autohome.com.cn/{}/info".format(uid)
            # (gender, location, age) scraped from the user's profile page.
            usermsg = self.parse_user(userurl)
            pushtime = each.xpath(
                './/span[@xname="date"]/text()').extract_first()
            comtpath = each.xpath('.//div[@class="x-reply font14"]')
            comtstr = comtpath.xpath('string(.)').extract_first().strip()
            item = BaseItem()  # fresh item per floor (do not reuse across yields)
            item['title'] = title
            item['bbs_name'] = '汽车之家'
            item['sonbbs_name'] = bbsname
            item['username'] = username
            item['comment_detail'] = comtstr
            item['comment_url'] = response.url
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = None
            item['collection'] = "汽车之家(test)"
            item['usergender'] = usermsg[0]
            item['userlocation'] = usermsg[1]
            item['userage'] = usermsg[2]
            yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parseContent(self, response):
    """Parse an i-motor.com.cn thread page and yield one item per floor.

    Fixes: the uid regex is now a raw string (non-raw ``'\\d+'`` is an
    invalid escape sequence and a future SyntaxError), and a floor whose
    profile href contains no digits is skipped instead of raising
    AttributeError and aborting the whole page.

    :param response: thread-page response
    :return: generator of BaseItem
    """
    title = response.xpath('//span[@id="thread_subject"]/text()').extract_first()
    sonbbs_name = response.xpath('//div[@class="z"]/a[4]/text()').extract_first()
    try:
        for each in response.xpath('//div[@id="ct"]/div[@id="postlist"]/div'):
            # Author name; skip anonymous/deleted floors.
            username = each.xpath('.//div[@class="authi"]/a[@class="xw1"]/text()').extract_first()
            if username is None:
                continue
            urlstr = each.xpath('.//div[@class="authi"]/a[@class="xw1"]/@href').extract_first()
            uid_match = re.search(r'\d+', urlstr)
            if uid_match is None:
                # No numeric uid in the profile link: cannot build the
                # profile URL, skip this floor rather than crash the page.
                continue
            userurl = 'http://www.i-motor.com.cn/home.php?mod=space&uid={}&do=profile'.format(uid_match.group())
            # (gender, location, age) from the author's profile page.
            usermsg = self.parse_user(userurl)
            # Floor body text.
            comt_path = each.xpath('.//td[@class="t_f"]')
            comt = comt_path.xpath('string(.)').extract_first()
            # Thread URL.
            comturl = response.url
            # Post timestamp.
            pushtime = each.xpath('.//div[@class="authi"]/em/text()').extract_first()
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = 'imotor'
            item['sonbbs_name'] = sonbbs_name
            item['username'] = username
            item['comment_detail'] = comt
            item['comment_url'] = comturl
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['car_type'] = None
            item['collection'] = "imotor"
            item['usergender'] = usermsg[0]
            item['userlocation'] = usermsg[1]
            item['userage'] = usermsg[2]
            yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(response.url, e.__traceback__.tb_lineno, e))
def parse591moto(self, response):
    """Yield one BaseItem per floor of a 591moto forum thread.

    :param response: thread-page response
    :return: generator of BaseItem
    """
    title = response.xpath('//h1/span/text()').extract_first()
    sonbbsname = response.xpath(
        '//div[@id="pt"]/div[@class="z"]/a[4]/text()').extract_first()
    try:
        posts = response.xpath(
            '//div[@id="postlist"]/div[starts-with(@id,"post_")]')
        for post in posts:
            author = post.xpath(
                './/div[@class="authi"]/a[@class="xw1"]/text()'
            ).extract_first()
            posted = post.xpath(
                './/div[@class="authi"]/em/text()').extract_first()
            # Recent posts render only the "posted at" prefix in the <em>;
            # the actual timestamp sits in the nested <span title="...">.
            if posted == "发表于 ":
                posted = post.xpath(
                    './/div[@class="authi"]/em/span/@title').extract_first()
            body = post.xpath(
                './/td[@class="t_f"]').xpath('string(.)').extract_first()
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = '591摩托论坛'
            item['sonbbs_name'] = sonbbsname
            item['username'] = author
            item['comment_detail'] = body
            item['comment_url'] = response.url
            item['push_time'] = posted
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = None
            item['collection'] = "591摩托"
            item['usergender'] = None
            item['userlocation'] = None
            item['userage'] = None
            yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parse_reply(self, response):
    """Parse a Tieba thread page: yield one item per floor, then request the
    nested (floor-in-floor) replies via the ``totalComment`` endpoint.

    Fixes: the tid/pn regex patterns are now raw strings (a non-raw backslash
    escape is deprecated and a future SyntaxError), and the bare ``except:``
    is narrowed to ``except Exception``.

    :param response: thread-page response
    :return: generator of BaseItem plus one scrapy.Request for the replies
    """
    bbsname = response.xpath(
        '//div[@class="card_title "]/a/text()').extract_first()
    title = response.xpath('//h3/text()').extract_first()
    # Some bars render the title in <h1> instead of <h3>.
    if title is None:
        title = response.xpath('//h1/text()').extract_first()
    try:
        for each in response.xpath('//div[starts-with(@class,"l_post")]'):
            username = each.xpath(
                './/li[@class="d_name"]/a/text()').extract_first()
            pushtime = None
            # Preferred source: the structured data-field JSON attribute.
            try:
                data = each.xpath('./@data-field').extract_first()
                pushtime = json.loads(data)["content"]["date"]
            except Exception:
                pass
            if not pushtime:
                # Fallback: visible tail-info spans (mobile | PC layouts).
                pushtime = each.xpath(
                    './/span[@class="tail-info"][3]/text() | .//span[@class="tail-info"][2]/text()'
                ).extract_first()
            if not pushtime:
                pushtime = each.xpath(
                    './/*[@class="p_tail"]/li[2]/span/text()'
                ).extract_first()
            if not pushtime:
                continue
            # Outside the crawl window.
            if pushtime < self.start_time:
                continue
            comtpath = each.xpath(
                './/div[starts-with(@id,"post_content_")]')
            comtstr = comtpath.xpath('string(.)').extract_first()
            if comtstr is None:
                continue
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = '百度贴吧'
            item['sonbbs_name'] = bbsname
            item['username'] = username
            item['comment_detail'] = comtstr
            item['comment_url'] = response.url
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = None
            item['collection'] = "(百度贴吧)" + "自动驾驶"
            item['usergender'] = None
            item['userlocation'] = None
            item['userage'] = None
            item["kw"] = None
            yield item
        # Follow-up request for the nested replies of this page.
        tid = re.search(r'p/(\d+)', response.url).group(1)
        pg_num = re.search(r'pn=(\d+)', response.url)
        total_comment_url = "https://tieba.baidu.com/p/totalComment?tid={}".format(
            tid)
        if pg_num:
            pg_num = pg_num.group(1)
            total_comment_url += "&pn={}".format(pg_num)
        meta = {
            "title": title,
            "sonbbs_name": bbsname,
            "comment_url": response.url
        }
        yield scrapy.Request(url=total_comment_url,
                             callback=self.parse_comment,
                             meta=meta)
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parse_tiezi(self, response):
    """Parse a Tieba thread reached from search: either scrape only the one
    matched reply floor (non-topic hit) or every floor of the page.

    Fixes: ``if reply_num is not 0`` compared an int with ``is`` (works only
    by CPython small-int caching and is a SyntaxWarning) — now ``!= 0``;
    regexes are raw strings; bare ``except:`` narrowed to ``except
    Exception``; the duplicated dead ``pushtime < self.start_time`` re-checks
    after the item was built are removed (the same condition already
    returned/continued above).

    :param response: thread-page response; ``response.meta`` carries ``kw``
        and, for non-topic hits, ``pid`` of the matched floor.
    :return: generator of BaseItem and scrapy.Request
    """
    # response.encoding = "utf-8"
    bbsname = response.xpath(
        '//div[@class="card_title "]/a/text()').extract_first()
    title = response.xpath('//h3/text()').extract_first()
    # Some bars render the title in <h1> instead of <h3>.
    if title is None:
        title = response.xpath('//h1/text()').extract_first()
    try:
        # Build the nested-replies endpoint URL for this thread/page.
        tid = re.search(r'p/(\d+)', response.url).group(1)
        pg_num = re.search(r'pn=(\d+)', response.url)
        total_comment_url = "https://tieba.baidu.com/p/totalComment?tid={}".format(
            tid)
        if pg_num:
            pg_num = pg_num.group(1)
            total_comment_url += "&pn={}".format(pg_num)
        meta = {
            "title": title,
            "sonbbs_name": bbsname,
            "comment_url": response.url,
            "kw": response.meta["kw"]
        }
        # 3.27: if the hit is not a topic post, scrape only the matched floor.
        if self.base_kw not in title:
            base_path = response.xpath(
                '//div[@id="post_content_{}"]/../../../..'.format(
                    response.meta["pid"]))
            # Number of nested replies under the matched floor.
            reply_num = 0
            try:
                data = base_path.xpath('./@data-field').extract_first()
                reply_num = int(json.loads(data)["content"]["comment_num"])
            except Exception:
                print("获取评论数出错")
            if reply_num != 0:  # fix: was `is not 0` (identity vs equality)
                meta["pid"] = response.meta["pid"]
                yield scrapy.Request(url=total_comment_url,
                                     callback=self.parse_comment,
                                     meta=meta)
            username = base_path.xpath(
                './/li[@class="d_name"]/a/text()').extract_first()
            # Timestamp: mobile layout, then PC layout, then data-field JSON.
            pushtime = base_path.xpath(
                './/span[@class="tail-info"][3]/text()').extract_first()
            if pushtime is None:
                pushtime = base_path.xpath(
                    './/span[@class="tail-info"][2]/text()').extract_first()
            if pushtime is None:
                try:
                    data = base_path.xpath('./@data-field').extract_first()
                    pushtime = json.loads(data)["content"]["date"]
                except Exception:
                    return
            if pushtime < self.start_time:  # 3.26 time-window condition
                return
            comtpath = base_path.xpath(
                './/div[starts-with(@id,"post_content_")]')
            comtstr = comtpath.xpath('string(.)').extract_first()
            if comtstr is None:
                return
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = '百度贴吧'
            item['sonbbs_name'] = bbsname
            item['username'] = username
            item['comment_detail'] = comtstr
            item['comment_url'] = response.url
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                               time.localtime())
            item['car_type'] = None
            item['collection'] = "(百度贴吧搜索)自动驾驶"
            item['usergender'] = None
            item['userlocation'] = None
            item['userage'] = None
            item['kw'] = response.meta["kw"]
            yield item
            return
        else:
            # Topic post: fetch nested replies and scrape every floor.
            yield scrapy.Request(url=total_comment_url,
                                 callback=self.parse_comment,
                                 meta=meta)
            for each in response.xpath(
                    '//div[starts-with(@class,"l_post")]'):
                username = each.xpath(
                    './/li[@class="d_name"]/a/text()').extract_first()
                pushtime = each.xpath(
                    './/span[@class="tail-info"][3]/text()').extract_first()
                if pushtime is None:
                    pushtime = each.xpath(
                        './/span[@class="tail-info"][2]/text()'
                    ).extract_first()
                if pushtime is None:
                    try:
                        data = each.xpath('./@data-field').extract_first()
                        pushtime = json.loads(data)["content"]["date"]
                    except Exception:
                        continue
                if pushtime < self.start_time:  # 3.26 time-window condition
                    continue
                comtpath = each.xpath(
                    './/div[starts-with(@id,"post_content_")]')
                comtstr = comtpath.xpath('string(.)').extract_first()
                if comtstr is None:
                    continue
                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = '百度贴吧'
                item['sonbbs_name'] = bbsname
                item['username'] = username
                item['comment_detail'] = comtstr
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = None
                item['collection'] = "(百度贴吧搜索)自动驾驶"
                item['usergender'] = None
                item['userlocation'] = None
                item['userage'] = None
                item['kw'] = response.meta["kw"]
                yield item
    except Exception as e:
        self.error('【parse_detail出错】url:{}; line{}:{}'.format(
            response.url, e.__traceback__.tb_lineno, e))
def parse_content(self, response):
    """Parse a niumowang (牛摩网) club thread page and yield one item per post.

    Bug fix: the original began with ``response.encoding("utf8")`` — calling
    the ``encoding`` string attribute as a function raises TypeError on every
    response before any parsing runs; the call is removed. The regex-cleanup
    loop variable was also named ``item`` and shadowed the later
    ``item = BaseItem()``; it is renamed, and the non-raw ``'\\d+'`` regex is
    now a raw string.

    :param response: club thread-page response
    :return: generator of BaseItem
    """
    # self.info('牛摩网:[{}]、.状态:[{}]'.format(response.url, response.status))
    sonbbs_name = response.xpath('//div[@class="navigations"]/a[4]/text()').extract_first()
    if sonbbs_name is None:
        sonbbs_name = response.xpath('//div[@class="navigations"]/a[3]/text()').extract_first()
    title = response.xpath('//p[@style="position:relative;"]/text()').extract_first()
    try:
        for each in response.xpath('//div[@id="club_content_list"]'):
            commenturl = response.url
            username = each.xpath('.//strong/text()').extract_first()
            pushtime_str = each.xpath('.//span[@style="float:left"]/text()').extract_first()
            # Normalize relative timestamps ("昨天", "N 小时前", ...) to
            # absolute date strings; a missing/unparseable timestamp drops
            # this post.
            try:
                pushtime = pushtime_str.replace(' 发表于:', '')
                if re.search("昨天", pushtime):
                    yestoday = time.strftime("%Y/%m/%d", time.localtime(time.time() - 86400))
                    pushtime = pushtime.replace('昨天', yestoday)
                if re.search('前天', pushtime):
                    Byestoday = time.strftime("%Y/%m/%d", time.localtime(time.time() - 172800))
                    pushtime = pushtime.replace('前天', Byestoday)
                if re.search('今天', pushtime):
                    Byestoday = time.strftime("%Y/%m/%d", time.localtime(time.time()))
                    pushtime = pushtime.replace('今天', Byestoday)
                if re.search('小时前', pushtime):
                    NUM = int(re.search(r'\d+', pushtime).group())
                    sec = NUM * 60 * 60
                    today = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time() - sec))
                    pushtime = pushtime.replace("{} 小时前".format(NUM), today)
                if re.search('分钟前', pushtime):
                    NUM = int(re.search(r'\d+', pushtime).group())
                    sec = NUM * 60
                    today = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(time.time() - sec))
                    pushtime = pushtime.replace("{} 分钟前".format(NUM), today)
            except Exception:
                continue
            # Post body: join the child nodes and strip markup artifacts.
            comtstr = None
            try:
                comt = each.xpath('.//div[@class="clubcontent"]/node()').extract()
                comtstr = ''.join(comt)
                for pattern in ['\r', '<.*?>', '\xa0', '\n']:
                    comtstr = re.sub(pattern, '', comtstr)
            except Exception:
                pass
            item = BaseItem()
            item['title'] = title
            item['bbs_name'] = '牛摩论坛'
            item['sonbbs_name'] = sonbbs_name
            item['username'] = username
            if item['username'] is None:
                continue
            item['comment_detail'] = comtstr
            # Posts without any text content are discarded.
            if not isinstance(item['comment_detail'], str):
                continue
            item['comment_url'] = commenturl
            item['push_time'] = pushtime
            item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
            item['car_type'] = None
            item['collection'] = "牛摩网(竞品)"  # TODO rename the target table
            item['usergender'] = None
            item['userlocation'] = None
            item['userage'] = None
            yield item
    except Exception as e:
        self.error('【parse_detail出错】{},{}'.format(response.url, e))