예제 #1
0
파일: lelebbs.py 프로젝트: liuhuan21/ACCS
    def parse(self, response):
        """Parse a forum listing page.

        Yields one request per article link (handled by ``parse_item``) and,
        when a pagination link exists, one request for the next listing page
        (this site counts pages *downward*, so the page digit is decremented).
        """
        sel = Selector(response)
        item = CpsecspidersItem()
        # Relative hrefs of the articles on this listing page.
        article_urls = sel.xpath('//dt//p/a/@href').extract()
        # Last pagination link -> next listing page to crawl.
        next_page_url = sel.xpath(
            "//a[@class='paginate'][last()]/@href").extract()

        for url in article_urls:
            # Resolve the relative href against the spider's base URL.
            full_url = urljoin(self.baseurl, url)
            # Hand the shared item to parse_item through request meta.
            request = scrapy.Request(full_url, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        # Guard against an empty extract() result (the original indexed
        # next_page_url[0] unconditionally, raising IndexError on the
        # last page where no pagination link exists).
        if next_page_url:
            href = next_page_url[0]
            # The current page number is the second-to-last character of the
            # href; character 8 of the href encodes the same number.
            # NOTE(review): this positional surgery assumes a fixed URL
            # layout — confirm against the site's pagination hrefs.
            page_char = href[-2:-1]
            # isdigit() also guards int() against a non-numeric character,
            # which previously raised ValueError.
            if page_char.isdigit() and int(page_char) > 0:
                chars = list(href)
                chars[8] = str(int(page_char) - 1)
                request = scrapy.Request(urljoin(self.baseurl, ''.join(chars)),
                                         callback=self.parse)
                yield request
예제 #2
0
    def parse(self, response):
        """Parse a board listing page.

        Schedules every article link for ``parse_item`` and follows the
        'next page' pagination link by recursing into this method.
        """
        sel = Selector(response)
        item = CpsecspidersItem()
        # Article links inside the board table.
        article_urls = sel.xpath(
            '//div[@class="mt5"]/table[@class="tab-bbs-list tab-bbs-list-2"]//tr[@class="bg"]/td[1]/a/@href'
        ).extract()
        # Last pagination link points at the next listing page.
        next_page_url = sel.xpath(
            "//div[@class='short-pages-2 clearfix']/div[@class='links']/a[last()]/@href"
        ).extract()

        for url in article_urls:
            # Resolve the relative href against the spider's base URL.
            full_url = urljoin(self.baseurl, url)
            # Hand the shared item to parse_item through request meta.
            request = scrapy.Request(full_url, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        # Check the list is non-empty before indexing (the original did
        # next_page_url[0] directly, raising IndexError on the final page).
        if next_page_url and next_page_url[0]:
            request = scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                                     callback=self.parse)
            yield request
예제 #3
0
    def parse_item(self, response):
        """Extract one Tianya forum article page into a CpsecspidersItem.

        Scrapes title, body text, author, click count and reply count,
        stamps the crawl date, and yields the loaded item.
        """
        import time
        # Bind the formatted date to its own name; the original assigned it
        # to `time`, shadowing the module just imported.
        crawl_date = time.strftime("%Y.%m.%d", time.localtime())
        sel = Selector(response)
        loader = ItemLoader(item=CpsecspidersItem(), response=response)

        article_url = str(response.url)
        article_name = sel.xpath(
            '//div[@id="post_head"]/h1/span/span/text()').extract()
        article_content = sel.xpath(
            '//div[@class="atl-main"]//div/div[@class="atl-content"]/div[2]/div[1]/text()'
        ).extract()
        article_author = sel.xpath(
            "//a[@class='js-vip-check']/text()").extract()
        article_clik_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[3]/text(),":")'
        ).extract()
        article_reply_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[4]/text(),":")'
        ).extract()

        # Join all body fragments at once instead of quadratic `+=`.
        content = ''.join(article_content)
        # Fall back to a placeholder title when the page has none.
        if not article_name:
            article_name = "无名"
        # extract() may return an empty list; guard every [0] access that
        # previously raised IndexError on pages missing the field.
        article_author = article_author[0] if article_author else ''
        click_num = article_clik_num[0] if article_clik_num else ''
        reply_num = article_reply_num[0] if article_reply_num else ''

        loader.add_value('title', article_name)
        loader.add_value('content', content)
        loader.add_value('url', article_url)
        loader.add_value('reply', reply_num)
        loader.add_value('click', click_num)
        loader.add_value('uname', article_author)
        loader.add_value('source', "天涯论坛-养宠心情")
        loader.add_value('typeid', 0)
        loader.add_value('datetime', crawl_date)
        loader.add_value('EmotionalScore', 0)
        yield loader.load_item()
예제 #4
0
파일: cpnbbs.py 프로젝트: liuhuan21/ACCS
    def parse_item(self, response):
        """Extract one CPN forum article page into a CpsecspidersItem.

        Scrapes title, body text, author, click count and reply count,
        stamps the crawl date, and yields the loaded item.
        """
        import time
        # Bind the formatted date to its own name; the original assigned it
        # to `time`, shadowing the module just imported.
        crawl_date = time.strftime("%Y.%m.%d", time.localtime())
        sel = Selector(response)
        loader = ItemLoader(item=CpsecspidersItem(), response=response)

        article_url = str(response.url)
        article_name = sel.xpath('//a[@class="maintitle"]/text()').extract()
        article_content = sel.xpath(
            '//table[@class="attachtable"]//text()').extract()
        article_author = sel.xpath(
            "//td[@class='row1'][1]/span[@class='postdetails']/text()[1]"
        ).extract()
        article_clik_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[3]/text(),":")'
        ).extract()
        article_reply_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[4]/text(),":")'
        ).extract()

        # Join all body fragments at once instead of quadratic `+=`.
        content = ''.join(article_content)
        # Fall back to a placeholder title when the page has none.
        if not article_name:
            article_name = "无名"
        # extract() may return an empty list; guard every [0] access that
        # previously raised IndexError on pages missing the field.
        article_author = article_author[0] if article_author else ''
        click_num = article_clik_num[0] if article_clik_num else ''
        reply_num = article_reply_num[0] if article_reply_num else ''

        loader.add_value('title', article_name)
        loader.add_value('content', content)
        loader.add_value('url', article_url)
        loader.add_value('reply', reply_num)
        loader.add_value('click', click_num)
        loader.add_value('uname', article_author)
        loader.add_value('source', "cpn论坛-百鸟园")
        loader.add_value('typeid', 0)
        loader.add_value('datetime', crawl_date)
        loader.add_value('EmotionalScore', 0)
        yield loader.load_item()
예제 #5
0
    def parse_item(self, response):
        """Extract one Baidu Tieba article page into a CpsecspidersItem.

        Scrapes title, body text, author, click count and reply count,
        stamps the crawl date, and yields the loaded item.
        """
        import time
        # Bind the formatted date to its own name; the original assigned it
        # to `time`, shadowing the module just imported.
        crawl_date = time.strftime("%Y.%m.%d", time.localtime())
        sel = Selector(response)
        loader = ItemLoader(item=CpsecspidersItem(), response=response)

        article_url = str(response.url)
        article_name = sel.xpath(
            "//h1[@class='core_title_txt  ']/text()").extract()
        article_content = sel.xpath(
            "//div[@class='p_content  p_content p_content_nameplate']/cc//text()"
        ).extract()
        article_author = sel.xpath(
            "substring-after(//a[@class='p_author_name j_user_card']/text(),'')"
        ).extract()
        article_clik_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[3]/text(),":")'
        ).extract()
        article_reply_num = sel.xpath(
            'substring-after(//div[@class="atl-info"]/span[4]/text(),":")'
        ).extract()

        # Join all body fragments at once instead of quadratic `+=`.
        content = ''.join(article_content)
        # Fall back to a placeholder title when the page has none.
        if not article_name:
            article_name = "无名"
        # extract() may return an empty list; guard every [0] access that
        # previously raised IndexError on pages missing the field.
        article_author = article_author[0] if article_author else ''
        click_num = article_clik_num[0] if article_clik_num else ''
        reply_num = article_reply_num[0] if article_reply_num else ''

        loader.add_value('title', article_name)
        loader.add_value('content', content)
        loader.add_value('url', article_url)
        loader.add_value('reply', reply_num)
        loader.add_value('click', click_num)
        loader.add_value('uname', article_author)
        loader.add_value('source', "百度贴吧")
        loader.add_value('typeid', 0)
        loader.add_value('datetime', crawl_date)
        loader.add_value('EmotionalScore', 0)
        yield loader.load_item()
예제 #6
0
    def parse(self, response):
        """Parse a Tieba listing page.

        Schedules every article link for ``parse_item`` and follows the
        'next page' pagination link by recursing into this method.
        """
        sel = Selector(response)
        item = CpsecspidersItem()
        # Article links on the listing page.
        article_urls = sel.xpath(
            '//a[@class="j_th_tit "]/@href').extract()
        # Tenth pager link -> next listing page.
        next_page_url = sel.xpath(
            '//*[@id="frs_list_pager"]/a[10]/@href').extract()
        for url in article_urls:
            # Resolve the relative href against the spider's base URL.
            full_url = urljoin(self.baseurl, url)
            # Hand the shared item to parse_item through request meta.
            request = scrapy.Request(full_url, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        # Check the list is non-empty before indexing (the original did
        # next_page_url[0] directly, raising IndexError on the final page);
        # the leftover debug print() is dropped.
        if next_page_url and next_page_url[0]:
            request = scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                                     callback=self.parse)
            yield request
예제 #7
0
파일: cpnbbs.py 프로젝트: liuhuan21/ACCS
    def parse(self, response):
        """Parse a CPN board listing page.

        Schedules every article link for ``parse_item`` and follows the
        'next page' pagination link by recursing into this method.
        """
        sel = Selector(response)
        item = CpsecspidersItem()
        # Article links (topic titles) on the listing page.
        article_urls = sel.xpath(
            '//span[@class="topictitle"]/a/@href').extract()
        # First nav link in the pager row -> next listing page.
        next_page_url = sel.xpath(
            '//td[@nowrap="nowrap"]/span[@class="nav"]/a[1]/@href').extract()
        for url in article_urls:
            # Resolve the relative href against the spider's base URL.
            full_url = urljoin(self.baseurl, url)
            # Hand the shared item to parse_item through request meta.
            request = scrapy.Request(full_url, callback=self.parse_item)
            request.meta['item'] = item
            yield request

        # Check the list is non-empty before indexing (the original did
        # next_page_url[0] directly, raising IndexError on the final page).
        if next_page_url and next_page_url[0]:
            request = scrapy.Request(urljoin(self.baseurl, next_page_url[0]),
                                     callback=self.parse)
            yield request