Exemplo n.º 1
0
    def parse_content(self, response):
        """Yield a ``BaseItem`` for every comment entry found on the page.

        The text of the page's ``<h1>`` is stored as ``car_type`` and, with
        "评论" appended, as ``title``.  Entries that have no user name are
        skipped.

        :param response: page response; only ``xpath()`` and ``url`` are used
        :return: generator of populated ``BaseItem`` objects
        """
        title = response.xpath('//h1').xpath("string(.)").extract_first()
        # ``extract_first()`` returns None when the page has no <h1>.  The
        # concatenation used to raise TypeError in that case and abort every
        # comment on the page, so build the comment title once, guarded.
        comment_title = None if title is None else title + "评论"
        try:
            for each in response.xpath('//ul[@class="comment_list"]/li'):
                username = each.xpath('.//p/text()').extract_first()
                if username is None:
                    # Entry without an author: nothing worth storing.
                    continue
                commentdetail = each.xpath('.//dd/text()').extract_first()
                pushtime = each.xpath('./div/div/div/text()').extract_first()

                item = BaseItem()
                item['title'] = comment_title
                item['bbs_name'] = '牛摩论坛'
                item['sonbbs_name'] = None
                item['username'] = username
                item['comment_detail'] = commentdetail
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = title
                item['collection'] = "牛摩网(竞品)"  # TODO: change the table name
                item['usergender'] = None
                item['userlocation'] = None
                item['userage'] = None
                yield item
        except Exception as e:
            # Log the failing URL and line instead of letting the callback die.
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 2
0
 def parse_comment(self, response):
     """Convert the JSON reply listing in ``response.text`` into items.

     The payload's ``data.comment_list`` is walked key by key; every entry
     of a key's ``comment_info`` becomes one ``BaseItem``.  Replies whose
     formatted time sorts before ``self.start_time`` are skipped.  Thread
     level fields (title, sub-forum, URL) are read from ``response.meta``.

     :param response: response carrying the JSON body and the request meta
     :return: generator of populated ``BaseItem`` objects
     """
     stamp_format = "%Y-%m-%d %H:%M:%S"
     try:
         payload = json.loads(response.text)
         threads = payload['data']['comment_list']
         for post_id in threads.keys():
             replies = threads[post_id]["comment_info"]
             for reply in replies:
                 item = BaseItem()
                 item['title'] = response.meta["title"]
                 item['bbs_name'] = '百度贴吧'
                 item['sonbbs_name'] = response.meta["sonbbs_name"]
                 item['username'] = reply["username"]
                 item['comment_detail'] = reply["content"]
                 item['comment_url'] = response.meta["comment_url"]

                 # ``now_time`` is passed to time.localtime(), i.e. it is
                 # treated as seconds since the epoch.
                 posted_at = time.strftime(
                     stamp_format, time.localtime(reply["now_time"]))
                 if posted_at < self.start_time:
                     continue

                 trailing_fields = (
                     ('push_time', posted_at),
                     ('catch_time',
                      time.strftime(stamp_format, time.localtime())),
                     ('car_type', None),
                     ('collection', "(百度贴吧)" + "自动驾驶"),
                     ('usergender', None),
                     ('userlocation', None),
                     ('userage', None),
                     ('kw', None),
                 )
                 for field, value in trailing_fields:
                     item[field] = value
                 yield item
     except Exception as e:
         # Any failure while reading the payload is reported as "no comments".
         self.error('【无评论】url:{}; line{}:{}'.format(
             response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 3
0
    def parseContent(self, response):
        """Parse a thread page and yield one ``BaseItem`` per post.

        Posts without an author name are skipped.  The author's profile URL
        is handed to ``self.parse_user``; its result is indexed as
        ``[0]`` gender, ``[1]`` location, ``[2]`` age.  When the profile gives
        no location and the sub-forum name contains "摩友交流区", the
        remainder of that name is used as the location.

        :param response: the downloaded thread page
        :return: generator of populated ``BaseItem`` objects
        """
        title = response.xpath(
            '//span[@id="thread_subject"]/text()').extract_first()
        sonbbs_name = response.xpath(
            '//div[@class="z"]/a[4]/text()').extract_first()
        try:
            for each in response.xpath(
                    '//div[@id="ct"]/div[@id="postlist"]/div'):
                # Author name; blocks without one are not posts.
                username = each.xpath(
                    './/div[@class="authi"]/a[@class="xw1"]/text()'
                ).extract_first()
                if username is None:
                    continue
                # Author profile.  A missing href used to raise TypeError
                # (str + None) and abort the remaining posts; keep the post
                # and leave the profile fields empty instead.
                userhref = each.xpath(
                    './/div[@class="authi"]/a[@class="xw1"]/@href'
                ).extract_first()
                if userhref is None:
                    usermsg = (None, None, None)
                else:
                    usermsg = self.parse_user(
                        'http://motorcycle.sh.cn/' + userhref)
                # Post body as plain text.
                comt_path = each.xpath('.//td[@class="t_f"]')
                comt = comt_path.xpath('string(.)').extract_first()
                # Post time.
                pushtime = each.xpath(
                    './/div[@class="authi"]/em/text()').extract_first()

                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = '摩托迷'
                item['sonbbs_name'] = sonbbs_name
                item['username'] = username
                item['comment_detail'] = comt
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = None
                item['collection'] = "摩托迷(3.22)"
                item['usergender'] = usermsg[0]
                item['userlocation'] = usermsg[1]
                # ``sonbbs_name`` is None when the breadcrumb is missing;
                # ``in None`` would raise TypeError, so test it first.
                if (item['userlocation'] is None and sonbbs_name
                        and "摩友交流区" in sonbbs_name):
                    item['userlocation'] = sonbbs_name.replace("摩友交流区", '')
                item['userage'] = usermsg[2]
                yield item
        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 4
0
    def parseAutoBBS(self, response):
        """Parse a forum thread page and yield one ``BaseItem`` per reply.

        If the decoded page does not contain the marker "本田摩托车论坛" the
        request is treated as redirected: a single item carrying only the URL
        and the collection name "test" is yielded and parsing stops.

        For each reply, ``self.parse_user`` is called with the author's
        profile URL; its result is indexed as ``[0]`` gender, ``[1]``
        location, ``[2]`` age.

        :param response: the downloaded thread page
        :return: generator of populated ``BaseItem`` objects
        """
        try:
            # TODO: redirect detection; adjust the marker when crawling
            # another forum.
            # ``str(response.body)`` yields the ASCII-escaped repr of the
            # bytes, in which a Chinese marker can never be found, so every
            # page looked redirected.  Search the decoded text instead.
            if "本田摩托车论坛" not in response.text:
                item = BaseItem()
                item['comment_url'] = response.url
                item['collection'] = "test"
                yield item
                return
            title = response.xpath(
                '//div[@id="consnav"]/span[4]/text()').extract_first()
            bbsname = response.xpath(
                '//div[@id="consnav"]/span[2]/a/text()').extract_first()
            for each in response.xpath(
                    '//div[@id="maxwrap-reply"]/div[@class="clearfix contstxt outer-section"]'
            ):
                username = each.xpath(
                    './/li[@class="txtcenter fw"]/a/text()').extract_first()
                if username is None:
                    # ``None.strip()`` used to abort the whole page here.
                    continue
                username = username.strip()
                uid = each.xpath('./@uid').extract_first()
                userurl = "https://i.autohome.com.cn/{}/info".format(uid)
                usermsg = self.parse_user(userurl)
                pushtime = each.xpath(
                    './/span[@xname="date"]/text()').extract_first()
                comtpath = each.xpath('.//div[@class="x-reply font14"]')
                comtstr = comtpath.xpath('string(.)').extract_first()
                if comtstr is not None:
                    comtstr = comtstr.strip()

                # A fresh item per reply: reusing one object would make every
                # yielded item alias the same, repeatedly overwritten, data.
                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = '汽车之家'
                item['sonbbs_name'] = bbsname
                item['username'] = username
                item['comment_detail'] = comtstr
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = None
                item['collection'] = "汽车之家(test)"
                item['usergender'] = usermsg[0]
                item['userlocation'] = usermsg[1]
                item['userage'] = usermsg[2]
                yield item
        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 5
0
    def parseContent(self, response):
        """Parse a thread page and yield one ``BaseItem`` per post.

        Posts without an author name are skipped.  The first run of digits in
        the author's profile link is taken as the uid and used to build the
        profile URL passed to ``self.parse_user``; the result is indexed as
        ``[0]`` gender, ``[1]`` location, ``[2]`` age.

        :param response: the downloaded thread page
        :return: generator of populated ``BaseItem`` objects
        """
        title = response.xpath('//span[@id="thread_subject"]/text()').extract_first()
        sonbbs_name = response.xpath('//div[@class="z"]/a[4]/text()').extract_first()
        try:
            for each in response.xpath('//div[@id="ct"]/div[@id="postlist"]/div'):
                # Author name; blocks without one are not posts.
                username = each.xpath('.//div[@class="authi"]/a[@class="xw1"]/text()').extract_first()
                if username is None:
                    continue
                urlstr = each.xpath('.//div[@class="authi"]/a[@class="xw1"]/@href').extract_first()
                # Raw string: '\d' in a normal literal is an invalid escape.
                uid_match = re.search(r'\d+', urlstr) if urlstr else None
                if uid_match is None:
                    # No uid in the link (or no link at all).  This used to
                    # raise and abort the remaining posts; keep the post and
                    # leave the profile fields empty instead.
                    usermsg = (None, None, None)
                else:
                    userurl = 'http://www.i-motor.com.cn/home.php?mod=space&uid={}&do=profile'.format(
                        uid_match.group())
                    usermsg = self.parse_user(userurl)
                # Post body as plain text.
                comt_path = each.xpath('.//td[@class="t_f"]')
                comt = comt_path.xpath('string(.)').extract_first()
                # Post time.
                pushtime = each.xpath('.//div[@class="authi"]/em/text()').extract_first()

                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = 'imotor'
                item['sonbbs_name'] = sonbbs_name
                item['username'] = username
                item['comment_detail'] = comt
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['car_type'] = None
                item['collection'] = "imotor"
                item['usergender'] = usermsg[0]
                item['userlocation'] = usermsg[1]
                item['userage'] = usermsg[2]
                yield item
        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 6
0
    def parse591moto(self, response):
        """Turn every post of a thread page into a ``BaseItem``.

        Thread title and sub-forum name are read once from the page header;
        author, time and body text are read from each ``post_*`` block.

        :param response: the downloaded thread page
        :return: generator of populated ``BaseItem`` objects
        """
        thread_title = response.xpath('//h1/span/text()').extract_first()
        board_name = response.xpath(
            '//div[@id="pt"]/div[@class="z"]/a[4]/text()').extract_first()
        try:
            posts = response.xpath(
                '//div[@id="postlist"]/div[starts-with(@id,"post_")]')
            for post in posts:
                author = post.xpath(
                    './/div[@class="authi"]/a[@class="xw1"]/text()'
                ).extract_first()
                posted_at = post.xpath(
                    './/div[@class="authi"]/em/text()').extract_first()
                if posted_at == "发表于 ":
                    # The <em> text is only the label here; fall back to the
                    # title attribute of the nested <span>.
                    posted_at = post.xpath(
                        './/div[@class="authi"]/em/span/@title'
                    ).extract_first()
                body_text = post.xpath(
                    './/td[@class="t_f"]').xpath('string(.)').extract_first()

                record = BaseItem()
                field_values = (
                    ('title', thread_title),
                    ('bbs_name', '591摩托论坛'),
                    ('sonbbs_name', board_name),
                    ('username', author),
                    ('comment_detail', body_text),
                    ('comment_url', response.url),
                    ('push_time', posted_at),
                    ('catch_time', time.strftime("%Y-%m-%d %H:%M:%S",
                                                 time.localtime())),
                    ('car_type', None),
                    ('collection', "591摩托"),
                    ('usergender', None),
                    ('userlocation', None),
                    ('userage', None),
                )
                for field, value in field_values:
                    record[field] = value
                yield record
        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 7
0
    def parse_reply(self, response):
        """Yield an item per post of a thread page, then request its replies.

        For every ``l_post`` block the post time is looked up in three places
        (the ``data-field`` JSON, the ``tail-info`` spans, the ``p_tail``
        list).  Posts with no time, with a time that sorts before
        ``self.start_time``, or with no body are skipped.  After the posts, a
        request for the thread's ``totalComment`` endpoint is yielded with
        ``self.parse_comment`` as callback.

        :param response: the downloaded thread page
        :return: generator of ``BaseItem`` objects followed by one request
        """
        bbsname = response.xpath(
            '//div[@class="card_title "]/a/text()').extract_first()
        title = response.xpath('//h3/text()').extract_first()
        if title is None:
            # Some boards put the title in <h1> instead.
            title = response.xpath('//h1/text()').extract_first()
        try:
            for each in response.xpath('//div[starts-with(@class,"l_post")]'):
                username = each.xpath(
                    './/li[@class="d_name"]/a/text()').extract_first()
                pushtime = None
                try:
                    data = each.xpath('./@data-field').extract_first()
                    pushtime = json.loads(data)["content"]["date"]
                except (TypeError, ValueError, KeyError):
                    # Attribute missing (None), not valid JSON, or without
                    # the expected keys: fall through to the HTML lookups.
                    pass
                if not pushtime:
                    pushtime = each.xpath(
                        './/span[@class="tail-info"][3]/text() | .//span[@class="tail-info"][2]/text()'
                    ).extract_first()  # mobile | desktop layout
                if not pushtime:
                    pushtime = each.xpath(
                        './/*[@class="p_tail"]/li[2]/span/text()'
                    ).extract_first()
                if not pushtime:
                    continue
                if pushtime < self.start_time:
                    continue
                comtpath = each.xpath(
                    './/div[starts-with(@id,"post_content_")]')
                comtstr = comtpath.xpath('string(.)').extract_first()
                if comtstr is None:
                    continue
                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = '百度贴吧'
                item['sonbbs_name'] = bbsname
                item['username'] = username
                item['comment_detail'] = comtstr
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                                   time.localtime())
                item['car_type'] = None
                item['collection'] = "(百度贴吧)" + "自动驾驶"
                item['usergender'] = None
                item['userlocation'] = None
                item['userage'] = None
                item["kw"] = None
                yield item

            # Request the nested replies of this page.  Raw strings keep
            # '\d' from being read as an (invalid) string escape.
            tid = re.search(r'p/(\d+)', response.url).group(1)
            pg_num = re.search(r'pn=(\d+)', response.url)
            total_comment_url = "https://tieba.baidu.com/p/totalComment?tid={}".format(
                tid)
            if pg_num:
                total_comment_url += "&pn={}".format(pg_num.group(1))
            meta = {
                "title": title,
                "sonbbs_name": bbsname,
                "comment_url": response.url
            }
            yield scrapy.Request(url=total_comment_url,
                                 callback=self.parse_comment,
                                 meta=meta)
        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))
Exemplo n.º 8
0
    def parse_tiezi(self, response):
        """Scrape a thread page reached from a keyword search.

        When ``self.base_kw`` is not part of the thread title, only the post
        identified by ``response.meta["pid"]`` is scraped, and its nested
        replies are requested only if its ``data-field`` reports a non-zero
        ``comment_num``.  Otherwise the nested replies are always requested
        and every ``l_post`` block on the page is scraped.

        :param response: the downloaded thread page; ``meta`` must carry
            ``kw`` and, for the single-post case, ``pid``
        :return: generator of ``scrapy.Request`` and ``BaseItem`` objects
        """
        bbsname = response.xpath(
            '//div[@class="card_title "]/a/text()').extract_first()
        title = response.xpath('//h3/text()').extract_first()
        if title is None:
            # Some boards put the title in <h1> instead.
            title = response.xpath('//h1/text()').extract_first()
        try:
            # Build the request for the nested replies.  Raw strings keep
            # '\d' from being read as an (invalid) string escape.
            tid = re.search(r'p/(\d+)', response.url).group(1)
            pg_num = re.search(r'pn=(\d+)', response.url)
            total_comment_url = "https://tieba.baidu.com/p/totalComment?tid={}".format(
                tid)
            if pg_num:
                total_comment_url += "&pn={}".format(pg_num.group(1))
            meta = {
                "title": title,
                "sonbbs_name": bbsname,
                "comment_url": response.url,
                "kw": response.meta["kw"]
            }
            # 3.27: when this is not a topic thread, scrape only the post
            # that matched.  NOTE: a missing title (None) raises TypeError
            # here and is reported by the handler below.
            if self.base_kw not in title:
                base_path = response.xpath(
                    '//div[@id="post_content_{}"]/../../../..'.format(
                        response.meta["pid"]))

                # Number of nested replies, taken from the data-field JSON.
                reply_num = 0
                try:
                    data = base_path.xpath('./@data-field').extract_first()
                    reply_num = int(json.loads(data)["content"]["comment_num"])
                except (TypeError, ValueError, KeyError):
                    print("获取评论数出错")
                # Value comparison: ``is not 0`` tested object identity.
                if reply_num != 0:
                    meta["pid"] = response.meta["pid"]
                    yield scrapy.Request(url=total_comment_url,
                                         callback=self.parse_comment,
                                         meta=meta)

                item = self._build_tiezi_item(base_path, response, title,
                                              bbsname)
                if item is not None:
                    yield item
                return

            yield scrapy.Request(url=total_comment_url,
                                 callback=self.parse_comment,
                                 meta=meta)
            for each in response.xpath(
                    '//div[starts-with(@class,"l_post")]'):
                item = self._build_tiezi_item(each, response, title, bbsname)
                if item is not None:
                    yield item

        except Exception as e:
            self.error('【parse_detail出错】url:{}; line{}:{}'.format(
                response.url, e.__traceback__.tb_lineno, e))

    def _build_tiezi_item(self, post, response, title, bbsname):
        """Build the ``BaseItem`` for one post block, or return None to skip.

        A post is skipped when no post time can be found, when the time sorts
        before ``self.start_time``, or when it has no body.
        """
        username = post.xpath(
            './/li[@class="d_name"]/a/text()').extract_first()
        pushtime = post.xpath(
            './/span[@class="tail-info"][3]/text()').extract_first()  # mobile
        if pushtime is None:
            pushtime = post.xpath(
                './/span[@class="tail-info"][2]/text()'
            ).extract_first()  # desktop
        if pushtime is None:
            try:
                data = post.xpath('./@data-field').extract_first()
                pushtime = json.loads(data)["content"]["date"]
            except (TypeError, ValueError, KeyError):
                return None
        if pushtime is None:
            return None
        if pushtime < self.start_time:  # 3.26: time filter
            return None
        comtpath = post.xpath('.//div[starts-with(@id,"post_content_")]')
        comtstr = comtpath.xpath('string(.)').extract_first()
        if comtstr is None:
            return None
        item = BaseItem()
        item['title'] = title
        item['bbs_name'] = '百度贴吧'
        item['sonbbs_name'] = bbsname
        item['username'] = username
        item['comment_detail'] = comtstr
        item['comment_url'] = response.url
        item['push_time'] = pushtime
        item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S",
                                           time.localtime())
        item['car_type'] = None
        item['collection'] = "(百度贴吧搜索)自动驾驶"
        item['usergender'] = None
        item['userlocation'] = None
        item['userage'] = None
        item['kw'] = response.meta["kw"]
        return item
Exemplo n.º 9
0
    def parse_content(self, response):
        """Parse a thread page and yield one ``BaseItem`` per post block.

        The sub-forum name is read from the fourth breadcrumb link, falling
        back to the third.  For every ``club_content_list`` block the author,
        the post time (relative wording converted to an absolute timestamp)
        and the tag-stripped body are extracted.  Blocks without an author,
        without a usable time, or without any text are skipped.

        :param response: the downloaded thread page
        :return: generator of populated ``BaseItem`` objects
        """
        # In Scrapy ``response.encoding`` is a string attribute, not a
        # method: calling it raised TypeError before anything was parsed.
        # ``replace`` returns a copy of the response decoded as UTF-8.
        response = response.replace(encoding="utf8")
        sonbbs_name = response.xpath('//div[@class="navigations"]/a[4]/text()').extract_first()
        if sonbbs_name is None:
            sonbbs_name = response.xpath('//div[@class="navigations"]/a[3]/text()').extract_first()
        title = response.xpath('//p[@style="position:relative;"]/text()').extract_first()
        try:
            for each in response.xpath('//div[@id="club_content_list"]'):
                username = each.xpath('.//strong/text()').extract_first()
                if username is None:
                    continue
                pushtime = self._niumo_push_time(
                    each.xpath('.//span[@style="float:left"]/text()').extract_first())
                if pushtime is None:
                    continue

                # Join the body nodes and strip line breaks, tags and
                # non-breaking spaces (same patterns, same order as before).
                comtstr = ''.join(
                    each.xpath('.//div[@class="clubcontent"]/node()').extract())
                for pattern in ('\r', '<.*?>', '\xa0', '\n'):
                    comtstr = re.sub(pattern, '', comtstr)
                # Discard posts without any text.  The joined body is always
                # a str, so the former isinstance() test could never do this.
                if not comtstr.strip():
                    continue

                item = BaseItem()
                item['title'] = title
                item['bbs_name'] = '牛摩论坛'
                item['sonbbs_name'] = sonbbs_name
                item['username'] = username
                item['comment_detail'] = comtstr
                item['comment_url'] = response.url
                item['push_time'] = pushtime
                item['catch_time'] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
                item['car_type'] = None
                item['collection'] = "牛摩网(竞品)"  # TODO: change the table name
                item['usergender'] = None
                item['userlocation'] = None
                item['userage'] = None
                yield item
        except Exception as e:
            self.error('【parse_detail出错】{},{}'.format(response.url, e))

    def _niumo_push_time(self, raw):
        """Convert the forum's post-time text into an absolute time string.

        Day words (昨天 / 前天 / 今天) are replaced by a ``%Y/%m/%d`` date and
        "N 小时前" / "N 分钟前" by a full ``%Y/%m/%d %H:%M:%S`` timestamp.
        Returns None when *raw* is None or a relative marker has no number
        in front of it.
        """
        if raw is None:
            return None
        pushtime = raw.replace(' 发表于:', '')
        now = time.time()
        for word, days_back in (('昨天', 1), ('前天', 2), ('今天', 0)):
            if word in pushtime:
                day = time.strftime(
                    "%Y/%m/%d", time.localtime(now - days_back * 86400))
                pushtime = pushtime.replace(word, day)
        for unit, unit_seconds in (('小时前', 3600), ('分钟前', 60)):
            if unit not in pushtime:
                continue
            # Tie the number to its unit and tolerate any spacing between
            # them, instead of taking the first digits anywhere in the text.
            found = re.search(r'(\d+)\s*' + unit, pushtime)
            if found is None:
                return None
            moment = time.strftime(
                "%Y/%m/%d %H:%M:%S",
                time.localtime(now - int(found.group(1)) * unit_seconds))
            pushtime = pushtime[:found.start()] + moment + pushtime[found.end():]
        return pushtime