Example #1
 def parse_login(self, response):
     # print("正在登陆....")
     # Check the response body to determine whether the login succeeded
     json_res = json.loads(response.text)
     if json_res["retcode"] == 50060000:
         print('出现验证码,请在常用登陆地运行')
         print(json_res["data"]['errurl'])
     elif json_res["retcode"] == 20000000:
         print("登陆成功")
         args = SARunner().parser()
         keyword_list = SARunner().keyworld_list(args.anaentities)
         keyword = keyword_list.replace(
             u"|",
             "~",
         )
         seek_url = "https://weibo.cn/search/"
         fd = {
             'advancedfilter': '1',
             'keyword': keyword,
             'nick': '',
             'starttime': self.starttime,
             'endtime': self.endtime,
             'sort': 'time',
             'smblog': '搜索'
         }
         print('搜索关键词:', keyword)
         yield scrapy.FormRequest(
             url=seek_url,
             formdata=fd,
             callback=self.parse_info,
         )
     else:
         print('登陆失败!')
Example #2
 def parse_main(self, response):
     item = response.meta['item']
     item['article'] = response.xpath(
         "//div[@class ='p-right left']//div[@id='p-detail']//p|"
         "//div[@id='content']//p|"
         "//div[@class='content']//p|"
         "//div[@class ='contant clearfix']/div[@class ='xl']//p|"
         "//div[@id ='Content']//p|"
         "//div[@class ='zj_left']/div[@class ='zj_nr']//p|"
         "//td[@class='text_con_16_33']//p|"
         "//div[@class ='content pack']//p|"
         "//div[@class = 'article']//p|"
         "//div[@class ='main-content-box']//p|"
         "//div[@id ='nr_wz']//p").xpath('string(.)').extract()
     item['TID'] = re.findall(r'c_.{1,}htm', item['href'])[0][2:-4]
     yield item
     article = Article(tid=item['TID'],
                       channel_id=11,
                       title=item['title'],
                       content=item['article'],
                       publish_datetime=item['time'],
                       url=item['href'],
                       author_name=item['source'],
                       digest=item['intro'])
     self.r.append(article)
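     # once self.r (parsed articles) has caught up with self.R (scheduled detail pages), the crawl is done and analysis starts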
     if len(self.r) == len(self.R):
         print(len(self.r))
         print('爬虫结束,开始热度分析')
         SARunner().article_List(self.r)
Example #3
 def parse_main(self, response):
     item = RMWspider1Item()
     item['title'] = response.meta['title'][0]
     item['time'] = response.meta['time']
     item['intro'] = response.meta['intro'][0].replace('[', '', 1).replace(
         ']',
         '',
     )
     item['href'] = response.meta['href']
     item['TID'] = re.findall(r'/c.{1,}html', item['href'])[0][1:-5]
     if 'people' in item['TID']:
         item['TID'] = re.findall(r'/c.{1,}', item['TID'])[0][1:]
     item['source'] = response.xpath(
         "//div[@class = 'artOri']/a/text()|"
         "//div[@class='box01']//a/text()|"
         "//div[@class='text_c']/p//a/text()|"
         "//div[@class = 'msgBox']//a/text()|"
         "//div[@class = 'page_c']/div[@class = 'fr']/a/text()|"
         "//div[@class = 'w1000 p2']//a/text()|"
         "//div[@class = 'p2j_text fl']/h2/a/text()").extract_first()
     item['article'] = response.xpath(
         "//div[@id='rwb_zw']//p|"
         "//div[@class='show_text']//p|"
         "//div[@class='artDet']//p|"
         "//div[@class='text_con clearfix']//p|"
         "//div[@class = 'content clear clearfix']//p|"
         "//div[@id = 'p_content']//p|"
         "//div[@class = 'box_con']//p|"
         "//div[@class = 'text_show']//p|"
         "//div[@class = 'gray box_text']//p|"
         "//div[@class = 'text_box clearfix']//p").xpath(
             'string(.)').extract()
     yield item
     article = Article(tid=item['TID'],
                       channel_id=5,
                       title=item['title'],
                       content=item['article'],
                       publish_datetime=item['time'],
                       url=item['href'],
                       author_name=item['source'],
                       digest=item['intro'])
     self.r.append(article)
     if len(self.R) == len(self.r):
         print(len(self.r))
         print('爬虫结束,开始热度分析')
         SARunner().article_List(self.r)
Example #4
    def parse_info(self, response):
        weibo_list = response.xpath("//div[@class='c' and @id]")
        for weibo in weibo_list:
            item = Weibospider1Item()
            div = weibo.xpath("./div")
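            # the number of child <div> blocks determines the post layout:
            # 1 = original with no image, 2 = repost with no image or original with image, 3 = repost with image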
            if len(div) == 1:
                # Weibo post type
                item["category"] = "无图原创"
                item["author"] = weibo.xpath(
                    "./div/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item["content"] = weibo.xpath(
                    "./div/span[@class='ctt']").xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div/text()|./div/span[@class='ctt']//text()"
                    ).extract()
                item["dianzan"] = weibo.xpath("./div/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div/a[@class='cc']/@href").extract_first()
                item["send_time"] = weibo.xpath(
                    "./div/span[@class='ct']/text()").extract_first()
                item["reason"] = None
                item["img_url"] = None
                item['reason_name'] = None
                item['reason_id'] = None

            elif len(div) == 2:
                item["category"] = ""
                item["content"] = weibo.xpath("./div[1]/span[@class='ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["relay"] = weibo.xpath("./div[2]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[2]/a[@class='cc']/text()").extract_first()
                item["reason"] = None
                img = weibo.xpath("./div[2]//img[@class='ib']/@src")
                if len(img) == 0:
                    # repost without image
                    item['category'] = "无图转发"
                    item["author"] = weibo.xpath(
                        "./div/span[@class = 'cmt']/a/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/text()"
                    ).extract_first()
                    item['reason_id'] = weibo.xpath(
                        "./div[1]/span[@class = 'cmt']/a/@href").extract_first(
                        )
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["reason"] = weibo.xpath(
                        "./div[2]/text()|./div[2]//span[@class='kt']/text()"
                    ).extract()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["img_url"] = None
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

                else:
                    # original post with image
                    item['category'] = "有图原创"
                    item["author"] = weibo.xpath(
                        "./div/a[@class='nk']/text()").extract_first()
                    item['author_id'] = weibo.xpath(
                        "./div[1]/a[@class='nk']/@href").extract_first()
                    item['reason_name'] = None
                    item['reason_id'] = None
                    item["dianzan"] = weibo.xpath(
                        "./div[2]/a/text()").extract()[-4]
                    item["img_url"] = weibo.xpath(
                        "./div[2]//img[@class='ib']/@src").extract_first()
                    item["comment_url"] = weibo.xpath(
                        "./div[2]/a[@class='cc']/@href").extract_first()
                    item["send_time"] = weibo.xpath(
                        "./div[2]/span[@class='ct']/text()").extract_first()

            else:
                # len(div) == 3
                item["category"] = "带图片转发"
                item["author"] = weibo.xpath(
                    "./div[1]/a[@class='nk']/text()").extract_first()
                item['author_id'] = weibo.xpath(
                    "./div[1]/a[@class='nk']/@href").extract_first()
                item['reason_name'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/text()").extract_first()
                item['reason_id'] = weibo.xpath(
                    "./div[1]/span[@class = 'cmt']/a/@href").extract_first()
                item["content"] = weibo.xpath("./div[1]/span[@class = 'ctt']"
                                              ).xpath('string(.)').extract()
                img = weibo.xpath("./div[1]/span[@class='ctt']/img/@src")
                if len(img) == 1:
                    item["content"] = weibo.xpath(
                        "./div[1]/text()|./div[1]/span[@class='ctt']//text()"
                    ).extract()
                item["send_time"] = weibo.xpath(
                    "./div[3]/span[@class='ct']/text()").extract_first()
                item["dianzan"] = weibo.xpath(
                    "./div[3]/a/text()").extract()[-4]
                item["relay"] = weibo.xpath("./div[3]/a/text()").extract()[-3]
                item["comment"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/text()").extract_first()
                item["comment_url"] = weibo.xpath(
                    "./div[3]/a[@class='cc']/@href").extract_first()
                item["img_url"] = weibo.xpath(
                    "./div[2]//img[@class='ib']/@src").extract_first()
                item["reason"] = weibo.xpath(
                    "./div[3]/text()|./div[3]//span[@class='kt']/text()"
                ).extract()
            item['relay_url'] = ''

            item['TID'] = re.findall(r'uid=.{1,}&',
                                     item["comment_url"])[0][4:-1]
            a = weibo.xpath("//a[@class='nk']/@href").extract()
            yield item
            article = Article(tid=item['TID'],
                              channel_id=9,
                              content=item['content'],
                              publish_datetime=item['send_time'],
                              url=item['comment_url'],
                              title=item['content'][0:100],
                              author_id=item['author_id'],
                              author_name=item['author'])
            article.statistics = ArticleStatistics(
                tid=item['TID'],
                channel_id=9,
                reply_count=item['comment'],
                forward_count=item['relay'],
                like_count=item['dianzan'],
            )
            if int(item['relay']) > 0:
                self.relay_url_list.append(item['relay_url'])

            self.r.append(article)
            self.name_url_list.append(a)

        num_page = response.xpath(
            "//div[@id='pagelist']/form/div/text()").extract()
        num_page = [i.replace(
            u"\xa0",
            "",
        ) for i in num_page]
        num_page = [i for i in num_page if len(i) > 0][0]
        num_page = re.findall(r'\d+', num_page)
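        # num_page now holds [current page, total pages] taken from the pager text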

        print('正在爬取第', num_page[0], '页', num_page[1])
        max_page = NUM_PAGE
        if max_page is None:
            max_page = int(num_page[1])
        if int(num_page[0]) == max_page:
            L = []
            for L1 in self.name_url_list:
                L += L1
            for url_1 in L:
                with open(os_file.a + '\\crawler_url.txt',
                          'a',
                          encoding='utf-8') as f:
                    f.write(url_1 + "\n")

            print('页数上限,搜索页数据爬取完毕')
            print('爬虫结束,开始热度分析')
            SARunner().article_List(self.r)

            print("爬取微博数:", len(self.r))
            # print('开始爬取用户详情页数据,一共有', self.L2, '个非重复用户')
            # crawl each author's avatar, id, follows and followers
            with open(os_file.a + '\\crawler_url.txt', 'r',
                      encoding='utf-8') as f:
                urls = f.readlines()
                # count the number of user pages still to crawl
                # deduplicate the URLs (dict.fromkeys keeps insertion order)
                L2 = {}.fromkeys(urls).keys()
                self.L2 = len(L2)
                print('开始爬取用户详情页数据,一共有', self.L2, '个非重复用户')
                for url in L2:
                    yield scrapy.FormRequest(url=url,
                                             callback=self.parse_info_detail,
                                             dont_filter=True)
        else:
            next_url = response.xpath(
                "//a[text() = '下页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(next_url,
                                 callback=self.parse_info,
                                 dont_filter=True)
Example #5
    def parse_main(self, response):
        item = XinLangspider1Item()
        item['intro'] = str(response.meta["intro"]).replace(
            u"...",
            "",
        ).replace(
            u"']",
            "",
        ).replace(
            u"['",
            "",
        )
        item['href'] = response.meta["href"]
        item['time'] = response.meta['time']
        item['title_main'] = response.meta['title']
        item['article'] = response.xpath(
            "//div[@id = 'artibody']//p//text()|//div[@id = 'article']//p//text()"
        ).extract()
        item['source'] = response.xpath(
            "//a[@class = 'source ent-source']/text()|//span[@class = 'source ent-source']/text()"
        ).extract()
        item['TID'] = None

        a = re.findall(r'http.{1,}sina', item['href'])[0][7:-5]
        a = a.replace(
            u"/",
            "",
        )
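        # a is now the sina.com.cn subdomain (e.g. 'k'), used to choose the site-specific parsing rules below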

        if a in 'k':
            item['TID'] = re.findall(r'article_.{1,}_', item['href'])[0][8:-1]
        else:
            item['TID'] = re.findall(r'-ih.{1,}shtml', item['href'])[0][1:-6]

        if a in xw_type.cs:
            item['source'] = response.xpath(
                "//span[@id = 'art_source']/text()").extract()
            item['article'] = response.xpath(
                "//div[@class = 'article-body main-body']//p//text()").extract(
                )
        elif a in xw_type.ss:
            item['source'] = response.xpath(
                "//a[@class = 'source content-color']/text()|//span[@class ='source content-color']/text()"
            ).extract()
        elif a in xw_type.xw:
            item['article'] = response.xpath("//div[@id = 'article']").xpath(
                'string(.)').extract()
            item['source'] = response.xpath(
                "//a[@class = 'source']/text()").extract()
        elif a in xw_type.bk:
            item['source'] = '新浪博客'
            item['article'] = response.xpath(
                "//div[@id='sina_keyword_ad_area2']/div/font|//div[@id='sina_keyword_ad_area2']/p/font"
            ).xpath('string(.)').extract()

        # mobile version of the site
        if len(item['article']) == 0 and len(item['source']) == 0:
            item['article'] = response.xpath(
                "//section[@class = 'art_pic_card art_content']/p//text()"
            ).extract()
            item['source'] = response.xpath(
                "//h2[@class ='weibo_user']/text()").extract()

        yield item
        article = Article(tid=item['TID'],
                          channel_id=3,
                          title=item['title_main'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])

        self.R.append(article)
        if len(self.r) == len(self.R):
            print(len(self.R))
            print('开始保存数据库')
            print('爬虫结束,开始热度分析')
            SARunner().article_List(self.R)
Example #6
class XlSpider(scrapy.Spider):
    name = 'xl'
    allowed_domains = ['sina.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.XinLangPipeline': 300,
        },
    }
    r = []
    R = []
    MAX_PAGE = XL_MAX_PAGE
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    if '|' in keyword_list:
        keyword_list = keyword_list.replace(
            u"|",
            "~",
        )
    a = Urlchuli(keyword_list, 'gbk')
    one = a.url_bm()
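    # assumption: Urlchuli.url_bm() appears to encode the keyword as a GBK-escaped query string for the search URL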
    start_urls = [
        'http://search.sina.com.cn/?c=news&q={}&range=all&time=w&stime=&etime=&num=10'
        .format(one)
    ]  # time range: w = one week, m = one month, h = one hour, d = one day

    def parse(self, response):
        a = response.url
        if self.MAX_PAGE is None:
            MAX_PAGE = response.xpath(
                "//span[@class ='pagebox_cur_page']/text()").extract_first()
        else:
            MAX_PAGE = self.MAX_PAGE

        if 'https://s.weibo.com/weibo/' in str(a):
            print('搜索失败,请重新搜索')
            raise Exception(f'搜索失败,请重新搜索')

        else:
            div_list = response.xpath("//div[@class='box-result clearfix']")
            for div in div_list:
                data = div.xpath(".//p[@class = 'content']").xpath(
                    'string(.)').extract()
                title = div.xpath(".//h2/a/text()").extract()
                title = ''.join(title)
                href = div.xpath(".//h2/a/@href").extract_first()
                time = div.xpath(
                    ".//span[@class = 'fgray_time']/text()").extract_first()
                time = re.split(r' ', time)
                time = time[-2] + ' ' + time[-1]
                self.r.append(href)
                yield scrapy.Request(url=href,
                                     meta={
                                         "intro": data,
                                         'href': href,
                                         'time': time,
                                         'title': title
                                     },
                                     callback=self.parse_main,
                                     dont_filter=True)
            next_url = response.xpath(
                "//a[@title = '下一页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            page = response.xpath(
                "//span[@class = 'pagebox_cur_page']/text()").extract_first()
            if int(page) == int(MAX_PAGE):
                print('页数上限')
            else:
                yield scrapy.Request(next_url,
                                     callback=self.parse,
                                     dont_filter=True)

    def parse_main(self, response):
        item = XinLangspider1Item()
        item['intro'] = str(response.meta["intro"]).replace(
            u"...",
            "",
        ).replace(
            u"']",
            "",
        ).replace(
            u"['",
            "",
        )
        item['href'] = response.meta["href"]
        item['time'] = response.meta['time']
        item['title_main'] = response.meta['title']
        item['article'] = response.xpath(
            "//div[@id = 'artibody']//p//text()|//div[@id = 'article']//p//text()"
        ).extract()
        item['source'] = response.xpath(
            "//a[@class = 'source ent-source']/text()|//span[@class = 'source ent-source']/text()"
        ).extract()
        item['TID'] = None

        a = re.findall(r'http.{1,}sina', item['href'])[0][7:-5]
        a = a.replace(
            u"/",
            "",
        )

        if a in 'k':
            item['TID'] = re.findall(r'article_.{1,}_', item['href'])[0][8:-1]
        else:
            item['TID'] = re.findall(r'-ih.{1,}shtml', item['href'])[0][1:-6]

        if a in xw_type.cs:
            item['source'] = response.xpath(
                "//span[@id = 'art_source']/text()").extract()
            item['article'] = response.xpath(
                "//div[@class = 'article-body main-body']//p//text()").extract(
                )
        elif a in xw_type.ss:
            item['source'] = response.xpath(
                "//a[@class = 'source content-color']/text()|//span[@class ='source content-color']/text()"
            ).extract()
        elif a in xw_type.xw:
            item['article'] = response.xpath("//div[@id = 'article']").xpath(
                'string(.)').extract()
            item['source'] = response.xpath(
                "//a[@class = 'source']/text()").extract()
        elif a in xw_type.bk:
            item['source'] = '新浪博客'
            item['article'] = response.xpath(
                "//div[@id='sina_keyword_ad_area2']/div/font|//div[@id='sina_keyword_ad_area2']/p/font"
            ).xpath('string(.)').extract()

        # mobile version of the site
        if len(item['article']) == 0 and len(item['source']) == 0:
            item['article'] = response.xpath(
                "//section[@class = 'art_pic_card art_content']/p//text()"
            ).extract()
            item['source'] = response.xpath(
                "//h2[@class ='weibo_user']/text()").extract()

        yield item
        article = Article(tid=item['TID'],
                          channel_id=3,
                          title=item['title_main'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])

        self.R.append(article)
        if len(self.r) == len(self.R):
            print(len(self.R))
            print('开始保存数据库')
            print('爬虫结束,开始热度分析')
            SARunner().article_List(self.R)
Example #7
class RMWSpider(scrapy.Spider):
    name = 'rmw'
    allowed_domains = ['people.com', 'people.com.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.RenMingPipeline': 300,
        },
    }
    r = []
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    R = []
    R2 = 0
    R1 = len(keyword_list)

    headers = {
        'Location': 'news/getNewsResult.jsp',
        'Server': 'Apache-Coyote/1.1',
    }

    def start_requests(self):
        print('爬取关键词', self.keyword_list)
        for keyword in self.keyword_list:
            keyword = keyword.encode('gbk')
            print('正在搜索...')
            url = 'http://search.people.com.cn/cnpeople/search.do'
            formdata = {
                'siteName': 'news',
                'pageNum': '1',
                'facetFlag': 'true',
                'nodeType': 'belongsId',
                'nodeId': '0',
                'keyword': keyword,
            }
            yield scrapy.FormRequest(url=url,
                                     formdata=formdata,
                                     headers=self.headers,
                                     callback=self.parse_seek,
                                     dont_filter=True)

    def parse_seek(self, response):
        if response.url == 'http://search.people.com.cn/cnpeople/news/error.jsp':
            print('搜索失败')
        else:
            print(response.url)
            ul_list = response.xpath("//div[@class='fr w800']/ul")
            for ul in ul_list:
                item = {}
                item['title'] = ul.xpath("./li[1]//a").xpath(
                    'string(.)').extract()
                item['time'] = ul.xpath("./li[3]/text()").extract_first()
                item['intro'] = ul.xpath("./li[2]").xpath(
                    'string(.)').extract()
                item['href'] = ul.xpath("./li[1]//a/@href").extract_first()
                self.R.append(item['href'])
                yield scrapy.Request(item['href'],
                                     callback=self.parse_main,
                                     meta={
                                         'title': item['title'],
                                         'time': item['time'],
                                         'intro': item['intro'],
                                         'href': item['href']
                                     },
                                     dont_filter=True)
            next_url = response.xpath(
                "//a[text() = '下一页']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)

            num_page = response.xpath(
                "//div[@class = 'show_nav_bar']/text()").extract()
            try:
                num_page = ''.join(num_page)
                num_page = re.findall(r"\d+", num_page)[0]
            except IndexError as e:
                pass
            self.R2 += 1

            if RMW_MAX_PAGE is not None:
                if int(num_page) == RMW_MAX_PAGE:
                    if self.R1 == self.R2:
                        print('页数上限')
                else:
                    yield scrapy.Request(next_url,
                                         callback=self.parse_seek,
                                         dont_filter=True)
            else:
                yield scrapy.Request(next_url,
                                     callback=self.parse_seek,
                                     dont_filter=True)

    def parse_main(self, response):
        item = RMWspider1Item()
        item['title'] = response.meta['title'][0]
        item['time'] = response.meta['time']
        item['intro'] = response.meta['intro'][0].replace('[', '', 1).replace(
            ']',
            '',
        )
        item['href'] = response.meta['href']
        item['TID'] = re.findall(r'/c.{1,}html', item['href'])[0][1:-5]
        if 'people' in item['TID']:
            item['TID'] = re.findall(r'/c.{1,}', item['TID'])[0][1:]
        item['source'] = response.xpath(
            "//div[@class = 'artOri']/a/text()|"
            "//div[@class='box01']//a/text()|"
            "//div[@class='text_c']/p//a/text()|"
            "//div[@class = 'msgBox']//a/text()|"
            "//div[@class = 'page_c']/div[@class = 'fr']/a/text()|"
            "//div[@class = 'w1000 p2']//a/text()|"
            "//div[@class = 'p2j_text fl']/h2/a/text()").extract_first()
        item['article'] = response.xpath(
            "//div[@id='rwb_zw']//p|"
            "//div[@class='show_text']//p|"
            "//div[@class='artDet']//p|"
            "//div[@class='text_con clearfix']//p|"
            "//div[@class = 'content clear clearfix']//p|"
            "//div[@id = 'p_content']//p|"
            "//div[@class = 'box_con']//p|"
            "//div[@class = 'text_show']//p|"
            "//div[@class = 'gray box_text']//p|"
            "//div[@class = 'text_box clearfix']//p").xpath(
                'string(.)').extract()
        yield item
        article = Article(tid=item['TID'],
                          channel_id=5,
                          title=item['title'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])
        self.r.append(article)
        if len(self.R) == len(self.r):
            print(len(self.r))
            print('爬虫结束,开始热度分析')
            SARunner().article_List(self.r)
Example #8
class TbSpider(scrapy.Spider):
    name = 'tb'
    allowed_domains = ['tieba.baidu.com']  # allowed_domains takes bare domains, not URL paths
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.BaiDuTBPipeline': 300,
        },
    }
    r = []
    R = []
    MAX_PAGE = TB_MAX_PAGE
    # start_urls = ['http://tieba.baidu.com/mo/q----,sz@320_240-1-3---2/m?kw=吧名&pn=0']
    """
    url = 'http://wap.baidu.com/sf/vsearch?pd=tieba&word=%E4%B8%AD%E5%B1%B1%E5%A4%A7%E5%AD%A6&tn=vsearch&sa=vs_tab&lid=8756617510026267405&ms=1'
    This URL could be used to rewrite this spider.
    """

    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    p = 0
    P = len(keyword_list)

    def start_requests(self):
        for keyword in self.keyword_list:
            url = "http://tieba.baidu.com/f/search/res?ie=utf-8&qw={}".format(
                keyword)
            yield scrapy.FormRequest(url=url,
                                     callback=self.parse_detail,
                                     dont_filter=True)

    def parse_detail(self, response):
        print(response.url)
        div_list = response.xpath(
            "//div[@class = 's_post_list']/div[@class = 's_post']")
        for div in div_list:
            item = BaidutiebaItem()
            item['title'] = div.xpath(
                "./span[@class='p_title']/a[@class='bluelink' and @data-fid]"
            ).xpath('string(.)').extract()
            item['time'] = div.xpath(
                ".//font[@class='p_green p_date']/text()").extract_first()
            item['intro'] = div.xpath(".//div[@class = 'p_content']").xpath(
                'string(.)').extract()
            item['href'] = div.xpath(
                "./span[@class='p_title']/a[@class='bluelink' and @data-fid]/@href"
            ).extract_first()
            item['href'] = urllib.parse.urljoin(response.url, item['href'])
            item['source'] = div.xpath("./text()|.//a//font//text()").extract()
            item['source'] = ''.join(item['source'])

            if item['time'] is None:  # filter out tieba forum entries without a post time
                continue
            self.r.append(item['href'])
            yield scrapy.Request(item['href'],
                                 callback=self.parse_main,
                                 meta={'item': item},
                                 dont_filter=True)
        self.p += 1
        num_page = response.xpath(
            "//span[@class='cur']/text()").extract_first()
        max_page_url = response.xpath(
            "//a[text() = '尾页']/@href").extract_first()
        if self.MAX_PAGE is None and max_page_url is not None:
            self.MAX_PAGE = int(re.findall(r'&pn=.{1,}', max_page_url)[0][4:])

        if int(num_page) == self.MAX_PAGE:
            if self.p == self.P:
                print('页数上限')
        else:
            next_url = response.xpath(
                "//a[text() = '下一页>']/@href").extract_first()
            next_url = urllib.parse.urljoin(response.url, next_url)
            yield scrapy.Request(next_url,
                                 callback=self.parse_detail,
                                 dont_filter=True)

    def parse_main(self, response):
        item = response.meta['item']
        item['reply'] = response.xpath(
            "//div[@id='thread_theme_5']//span[@class='red'][1]/text()"
        ).extract()
        yield item
        self.R.append(item)
        if len(self.r) == len(self.R):
            print('开始保存数据库')
Example #9
class XHWSpider(scrapy.Spider):
    name = 'xhw'
    allowed_domains = ['so.news.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'crawler.pipelines.RenMingPipeline': 300,
        },
    }
    args = SARunner().parser()
    keyword_list = SARunner().keyworld_list(args.anaentities)
    keyword_list = re.split(r'\|', keyword_list)
    p = len(keyword_list)
    page = 1
    R = []
    r = []

    def start_requests(self):
        print('正在搜索...')
        keyWordAll = self.keyword_list[0]
        if self.p > 1:
            keyWordOne = self.keyword_list[1:]
            keyWordOne = '+'.join(keyWordOne)
            url = 'http://so.news.cn/getNews?keyWordAll={}&keyWordOne={}&keyWordIg=&searchFields=0&sortField=0&url=&senSearch=1&lang=cn&keyword={}&curPage=1'.format(
                keyWordAll, keyWordOne, keyWordAll)
            print(url)
        else:
            url = 'http://so.news.cn/getNews?keyword={}&curPage=1&sortField=0&searchFields=1&lang=cn'.format(
                keyWordAll)
        yield scrapy.Request(url=url,
                             callback=self.parse_seek,
                             dont_filter=True)

    def parse_seek(self, response):
        html = json.loads(response.text)
        data_list = html['content']['results']
        max_page = html['content']['pageCount']
        for data in data_list:
            item = XHWspider1Item()
            item['title'] = data['title'].replace(
                u'<font color=red>',
                '').replace(u'</font>',
                            '').replace(u'&nbsp',
                                        '').replace(u'&quot',
                                                    '').replace(u'\u3000', '')
            # item['title'] = item['title'].replace(u'<font color=red>', '')
            item['time'] = data['pubtime']
            item['href'] = data['url']
            item['intro'] = data['des']
            if 'xhwkhdapp' in item['href']:
                continue
            if item['intro'] is not None:
                item['intro'] = ''.join(item['intro'])
                item['intro'] = item['intro'].replace(u'<font', '').replace(
                    u'color=red>', '').replace(u'</font>', '')
            item['source'] = data['sitename']
            self.R.append(item['href'])
            yield scrapy.Request(url=item['href'],
                                 callback=self.parse_main,
                                 dont_filter=True,
                                 meta={'item': deepcopy(item)})

        if XHW_MAX_PAGE is not None:
            max_page = XHW_MAX_PAGE

        if self.page == max_page:
            print('页数上限')
        else:
            self.page += 1
            a = re.compile(r'&curPage=\d+')
            next_url = a.sub('&curPage={}'.format(self.page), response.url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_seek,
                                 dont_filter=True)

    def parse_main(self, response):
        item = response.meta['item']
        item['article'] = response.xpath(
            "//div[@class ='p-right left']//div[@id='p-detail']//p|"
            "//div[@id='content']//p|"
            "//div[@class='content']//p|"
            "//div[@class ='contant clearfix']/div[@class ='xl']//p|"
            "//div[@id ='Content']//p|"
            "//div[@class ='zj_left']/div[@class ='zj_nr']//p|"
            "//td[@class='text_con_16_33']//p|"
            "//div[@class ='content pack']//p|"
            "//div[@class = 'article']//p|"
            "//div[@class ='main-content-box']//p|"
            "//div[@id ='nr_wz']//p").xpath('string(.)').extract()
        item['TID'] = re.findall(r'c_.{1,}htm', item['href'])[0][2:-4]
        yield item
        article = Article(tid=item['TID'],
                          channel_id=11,
                          title=item['title'],
                          content=item['article'],
                          publish_datetime=item['time'],
                          url=item['href'],
                          author_name=item['source'],
                          digest=item['intro'])
        self.r.append(article)
        if len(self.r) == len(self.R):
            print(len(self.r))
            print('爬虫结束,开始热度分析')
            SARunner().article_List(self.r)