Пример #1
0
    def parse(self, response):
        """Extract news items from one listing page, then queue the next page.

        Every ``div.li`` entry inside the first ``div.con2`` container is
        turned into a ``MuseumnewsItem``.  Entries lacking a title, time,
        summary or link are skipped.  Afterwards ``self.page`` is advanced and
        the next listing page is requested, as long as ``self.page`` is still
        below 20.

        Args:
            response: Listing page response; only its ``xpath`` method is used.

        Yields:
            ``MuseumnewsItem`` objects for the complete entries, followed by at
            most one ``Request`` for the next listing page.
        """
        # Indexing ``[0]`` on an empty match used to raise IndexError and kill
        # the whole callback.  A page without the container is now handled as
        # a page without entries, so the pagination step below still runs.
        containers = response.xpath("//div[@class='con2']")
        news_list = (containers[0].xpath(".//div[@class='li']")
                     if containers else [])
        for news in news_list:
            title = news.xpath("./a/div/div[@class='t18']/text()")
            time = news.xpath("./a/div/div[@class='time']/text()")
            content = news.xpath("./a/div/div[@class='p']/text()")
            href = news.xpath("./a/@href")
            # Skip entries where any of the four fields is missing.
            if not (title and time and content and href):
                continue
            item = MuseumnewsItem()
            item['title'] = title[0].extract()
            item['author'] = "中国地质博物馆"
            item['time'] = time[0].extract()
            # 'description' and 'tag' are constants for every item from this
            # callback; their meaning is defined outside this method.
            item['description'] = "1"
            item['content'] = content[0].extract()
            item['url'] = prefixURL + href[0].extract()
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page < 20:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield Request(new_url, callback=self.parse, dont_filter=True)
Пример #2
0
    def parse(self, response):
        """Extract news items from one listing page, then queue the next page.

        For every ``ul.fen-right-list`` inside the first
        ``div.fen-right float-l`` container, one ``MuseumnewsItem`` is built
        from the first matching title, time and link.  The title doubles as
        the item's content because no summary is read from the page.
        Afterwards ``self.page`` is advanced and the next listing page is
        requested, as long as ``self.page`` is at most 45.

        Args:
            response: Listing page response; only its ``xpath`` method is used.

        Yields:
            ``MuseumnewsItem`` objects, followed by at most one ``Request`` for
            the next listing page.
        """
        # Indexing ``[0]`` on an empty match used to raise IndexError and kill
        # the whole callback.  A page without the container is now handled as
        # a page without entries, so the pagination step below still runs.
        containers = response.xpath("//div[@class='fen-right float-l']")
        news_list = (containers[0].xpath(".//ul[@class='fen-right-list']")
                     if containers else [])
        for news in news_list:
            # NOTE(review): the loop walks <ul> elements and only the first
            # match of each query is used, so at most one item is produced per
            # <ul>.  If a <ul> holds several <li> news rows, the rest are
            # dropped -- confirm against the live page markup.
            title = news.xpath("./li/span/a/text()")
            time = news.xpath("./li/span[@class='fen-right-time']/text()")
            href = news.xpath("./li/span/a/@href")
            if not (title and time and href):
                continue
            headline = title[0].extract()
            item = MuseumnewsItem()
            item['title'] = headline
            item['author'] = "中国科学技术馆"
            item['time'] = time[0].extract()
            # 'description' and 'tag' are constants for every item from this
            # callback; their meaning is defined outside this method.
            item['description'] = "1"
            item['content'] = headline
            item['url'] = prefixURL + href[0].extract()
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page <= 45:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield Request(new_url, callback=self.parse, dont_filter=True)
Пример #3
0
    def parse(self, response):
        """Extract news items from one listing page, then queue the next page.

        Every ``li`` under the ``ul`` children of the first
        ``div.infoDynamicList`` container is turned into a ``MuseumnewsItem``.
        Entries lacking a title, time, summary or link are skipped.
        Afterwards ``self.page`` is advanced and the next listing page is
        requested, as long as ``self.page`` is still below 30.

        Args:
            response: Listing page response; only its ``xpath`` method is used.

        Yields:
            ``MuseumnewsItem`` objects for the complete entries, followed by at
            most one ``Request`` for the next listing page.
        """
        # Indexing ``[0]`` on an empty match used to raise IndexError and kill
        # the whole callback.  A page without the container is now handled as
        # a page without entries, so the pagination step below still runs.
        containers = response.xpath("//div[@class='infoDynamicList']")
        news_list = containers[0].xpath("./ul//li") if containers else []
        for news in news_list:
            title = news.xpath("./a/h3/text()")
            time = news.xpath("./a/span/text()")
            content = news.xpath("./a/p/text()")
            href = news.xpath("./a/@href")
            # Skip entries where any of the four fields is missing.
            if not (title and time and content and href):
                continue
            item = MuseumnewsItem()
            item['title'] = title[0].extract()
            item['author'] = "中国人民革命军事博物馆"
            item['time'] = time[0].extract()
            # 'description' and 'tag' are constants for every item from this
            # callback; their meaning is defined outside this method.
            item['description'] = "1"
            item['content'] = content[0].extract()
            # The first character of the href is dropped before prefixing
            # (presumably a leading "." or "/" -- confirm against prefixURL).
            item['url'] = prefixURL + href[0].extract()[1:]
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page < 30:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield Request(new_url, callback=self.parse, dont_filter=True)
Пример #4
0
    def parse(self, response):
        """Extract news items from one listing page, then queue the next page.

        Every ``table[width='85%']`` inside the first ``td[height='450']``
        cell is treated as one news row.  The row's first text node is used as
        the time (with non-breaking spaces removed) and the second as the
        title, which also serves as the content.  Rows with fewer than two
        text nodes or without any ``href`` attribute are skipped.  Afterwards
        ``self.page`` is advanced and the next listing page is requested, as
        long as ``self.page`` is still below 71.

        Args:
            response: Listing page response; only its ``xpath`` method is used.

        Yields:
            ``MuseumnewsItem`` objects for the usable rows, followed by at most
            one ``Request`` for the next listing page.
        """
        # Indexing ``[0]`` on an empty match used to raise IndexError and kill
        # the whole callback.  A page without the cell is now handled as a
        # page without rows, so the pagination step below still runs.
        cells = response.xpath("//td[@height='450']")
        news_list = (cells[0].xpath(".//table[@width='85%']")
                     if cells else [])
        for news in news_list:
            info = news.xpath(".//text()")
            links = news.xpath(".//@href")
            # Both info[0] and info[1] are read below, so a single text node
            # is not enough; the old ``len(info) == 0`` check let such rows
            # through and crashed on info[1].  The link is checked for the
            # same reason.
            if len(info) < 2 or not links:
                continue
            headline = info[1].extract()
            item = MuseumnewsItem()
            item['title'] = headline
            item['author'] = "首都博物馆"
            item['time'] = info[0].extract().replace("\xa0", "")
            # 'description' and 'tag' are constants for every item from this
            # callback; their meaning is defined outside this method.
            item['description'] = "1"
            item['content'] = headline
            item['url'] = prefixURL + links[0].extract()
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if self.page < 71:
            self.page += 1
            new_url = URL.format(page=self.page)
            print(new_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield Request(new_url, callback=self.parse, dont_filter=True)
Пример #5
0
    def parse(self, response):
        """Extract news items from a search-result page, then queue the next.

        Every ``div.result`` block becomes one ``MuseumnewsItem``.  Link,
        title, summary and byline are each built by joining the matched text
        nodes and removing newlines and spaces.  When the page has no result
        blocks, ``self.end`` is set and nothing is yielded.  Otherwise
        ``self.page`` is advanced and the next result page is requested with
        ``page=self.page * 10``.

        Args:
            response: Result page response; only its ``xpath`` method is used.

        Yields:
            ``MuseumnewsItem`` objects, followed by one ``Request`` for the
            next result page unless ``self.end`` is set.
        """

        def squash(parts):
            # Join the extracted text nodes and drop newlines and spaces.
            return "".join(parts).replace("\n", "").replace(" ", "")

        news_list = response.xpath('//div[@class="result"]')
        if not news_list:
            # A page without result blocks ends the pagination.
            self.end = True
            return
        for news in news_list:
            url = squash(
                news.xpath('./h3[@class="c-title"]/a/@href').extract())
            title = squash(
                news.xpath('./h3[@class="c-title"]/a/text()').extract())

            content = squash(
                news.xpath('./div[@class="c-summary c-row "]/text()').extract())
            if content == "":
                # Fall back to the second child <div> of the summary block
                # when the summary has no direct text.
                content = squash(
                    news.xpath(
                        './div[@class="c-summary c-row "]/div[2]/text()'
                    ).extract())

            # After spaces and newlines are removed, split() separates the
            # byline on whatever other whitespace remains.
            author_time = squash(
                news.xpath(
                    './div[@class="c-summary c-row "]'
                    '//p[@class="c-author"]/text()'
                ).extract()).split()
            author = ""
            time = ""
            # Some news entries have no author and time.
            if author_time:
                author = author_time[0]
                # A byline with a single token used to raise IndexError on
                # author_time[1]; the time is now left empty in that case.
                # NOTE(review): a lone token is kept as the author, but it
                # could just as well be a time -- confirm with real pages.
                if len(author_time) >= 2:
                    time = self.parse_time(author_time[1])

            item = MuseumnewsItem()
            item['title'] = title
            item['author'] = author
            item['time'] = time
            # 'description' and 'tag' are constants for every item from this
            # callback; their meaning is defined outside this method.
            item['description'] = "1"
            item['content'] = content
            item['url'] = url
            item['tag'] = 1
            yield item

        print('page = {}'.format(self.page))
        if not self.end:
            self.page += 1
            new_url = URL.format(museum=self.museum,
                                 bt=self.startTime,
                                 et=self.endTime,
                                 page=self.page * 10)
            print(new_url)
            # dont_filter=True bypasses Scrapy's duplicate-request filter.
            yield Request(new_url, callback=self.parse, dont_filter=True)