Пример #1
0
    def parse_img(self, response):
        """Extract the centered picture's URL from a page and yield it as an item.

        ``image_name`` is taken from the URL itself: the 14 characters that
        precede its last four characters (``img_url[-18:-4]``).

        Args:
            response: The downloaded page to search for the picture.

        Yields:
            DemospiderItem: With ``image_url`` and ``image_name`` set. Nothing
            is yielded when the page has no matching ``<img>``.
        """
        img_url = Selector(response).xpath(
            '//div[@class="picsbox picsboxcenter"]/p/a/img/@src'
        ).extract_first()

        if img_url is None:
            # extract_first() returns None when nothing matches; slicing None
            # would raise TypeError, so skip pages without the picture.
            self.logger.warning("No picture found on %s; skipping", response.url)
            return

        imgItem = DemospiderItem()
        imgItem['image_url'] = img_url
        imgItem['image_name'] = img_url[-18:-4]

        yield imgItem
Пример #2
0
    def parse(self, response):
        """Route a response to its site-specific parser, or fall back to the database.

        When the response URL contains one of the known site markers, a new
        ``scrapy.Request`` for the same URL is yielded with the matching
        ``parse_*_contents`` callback. Otherwise the content already stored for
        this URL in the ``new`` table is read and yielded as an item.

        Args:
            response: The downloaded page whose URL decides the route.

        Yields:
            scrapy.Request: For URLs that match a known site marker.
            DemospiderItem: For all other URLs, with ``news_id`` (``-1`` when
            ``self.getNewsID`` returns ``[]``) and ``content`` from the
            database. Nothing is yielded when the database has no row for the
            URL.
        """
        url = response.url

        # Ordered (URL marker, callback name) pairs. The first marker found in
        # the URL wins, so the very broad "gmc" marker is kept last.
        routes = (
            ("baijiahao", "parse_baijiahao_contents"),
            ("new.qq.com", "parse_tencent_contents"),
            ("news.ifeng.com", "parse_ifeng_contents"),
            ("news.163.com", "parse_163_news_contents"),
            ("3g.163.com", "parse_3g_163_contents"),
            ("thepaper.cn", "parse_pengpai_contents"),
            ("news.sina.com.cn", "parse_sina_contents"),
            ("paper.people.com.cn", "parse_paper_people_contents"),
            ("xinhuanet.com", "parse_xinhuanet_contents"),
            ("bmnh.org.cn", "parse_bmnh_contents"),
            ("capitalmuseum", "parse_capital_museum_contents"),
            ("cstm.cdstm.cn", "parse_cstm_contents"),
            ("luxunmuseum", "parse_luxunmuseum_contents"),
            ("jb.mil.cn", "parse_military_museum_contents"),
            ("gmc", "parse_gmc_contents"),
        )
        for marker, callback_name in routes:
            if marker in url:
                # The callback is looked up only for the route that matched.
                yield scrapy.Request(url, callback=getattr(self, callback_name))
                return

        # No known site matched: reuse the content already stored for this URL.
        cursor = self.mydatabase.cursor()
        try:
            # The URL is passed as a bound parameter instead of being formatted
            # into the SQL text, so quotes in a URL can neither break nor
            # alter the query.
            # NOTE(review): "%s" is the placeholder of the DB-API "format"
            # paramstyle (MySQL/PostgreSQL drivers) -- confirm it matches the
            # driver behind self.mydatabase.
            cursor.execute("select content from new where url=%s", (url,))
            row = cursor.fetchone()
        finally:
            # Release the cursor even when the query fails.
            cursor.close()

        if row is None:
            # fetchone() returns None when no row matches; there is no content
            # to emit for this URL.
            self.logger.warning("No stored content for %s; skipping", url)
            return

        news_id = self.getNewsID(url)

        item = DemospiderItem()
        item['news_id'] = news_id if news_id != [] else -1
        item['content'] = row[0]
        yield item
Пример #3
0
    def parse_xinhuanet_contents(self, response):
        """Extract the article text from a xinhuanet.com page and yield it as an item.

        The text nodes of every ``<p>`` inside ``<div id="p-detail">`` are
        concatenated, then spaces, newlines, non-breaking spaces (U+00A0),
        ideographic spaces (U+3000) and copyright signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        container = response.xpath('//div[@id="p-detail"]')
        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        paragraphs = container.xpath('.//p/text()').extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #4
0
    def parse_paper_people_contents(self, response):
        """Extract the article text from a paper.people.com.cn page and yield it as an item.

        The text nodes of every ``<p>`` inside
        ``<div class="content">/<div class="right_c">`` are concatenated, then
        spaces, newlines, non-breaking spaces (U+00A0), ideographic spaces
        (U+3000) and copyright signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        container = response.xpath('//div[@class="content"]/div[@class="right_c"]')
        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        paragraphs = container.xpath('.//p/text()').extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #5
0
    def parse_pengpai_contents(self, response):
        """Extract the article text from a thepaper.cn page and yield it as an item.

        The text nodes of every ``<p>`` inside the first element matched by the
        positional container path are concatenated, then spaces, newlines,
        non-breaking spaces (U+00A0), ideographic spaces (U+3000) and copyright
        signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text. Nothing is yielded when
            the container element is not present on the page.
        """
        containers = response.xpath('//*[@id="root"]/div/div[3]/div[1]/div[1]/div[3]/div/div[1]')
        if not containers:
            # Indexing an empty result would raise IndexError; a page without
            # the expected container has no article text to emit.
            self.logger.warning("Article container not found on %s; skipping", response.url)
            return

        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        paragraphs = containers[0].xpath('.//p/text()').extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #6
0
    def parse_ifeng_contents(self, response):
        """Extract the article text from a news.ifeng.com page and yield it as an item.

        The text nodes of every ``<p>`` inside ``<div class="text-3zQ3cZD4">``
        are concatenated, then spaces, newlines, non-breaking spaces (U+00A0),
        ideographic spaces (U+3000) and copyright signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        container = response.xpath("//div[@class='text-3zQ3cZD4']")
        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        paragraphs = container.xpath(".//p/text()").extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #7
0
    def parse_tencent_contents(self, response):
        """Extract the article text from a new.qq.com page and yield it as an item.

        The text nodes of every ``<p class="one-p">`` inside ``<body>`` are
        concatenated, then spaces, newlines, non-breaking spaces (U+00A0),
        ideographic spaces (U+3000) and copyright signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        body = response.xpath('/html/body')
        # The leading "." keeps the query relative to the selected element
        # instead of restarting the search from the document root.
        paragraphs = body.xpath('.//p[@class="one-p"]/text()').extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #8
0
    def parse_gmc_contents(self, response):
        """Extract the article text from a "gmc" page and yield it as an item.

        The text nodes of every ``<p>/<span>`` inside
        ``<div class="article-cont">`` are concatenated, then spaces, newlines,
        carriage returns, non-breaking spaces (U+00A0), ideographic spaces
        (U+3000) and copyright signs (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        container = response.xpath('//div[@class="article-cont"]')
        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        fragments = container.xpath('.//p/span/text()').extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n\r \u3000'))
        content_string = ''.join(fragments).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #9
0
    def parse_capital_museum_contents(self, response):
        """Extract the article text from a "capitalmuseum" page and yield it as an item.

        The text nodes of every ``<p>`` inside ``<span class="wcontent">`` are
        concatenated, then spaces, newlines, carriage returns, non-breaking
        spaces (U+00A0), ideographic spaces (U+3000) and copyright signs
        (U+00A9) are removed.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        container = response.xpath("//span[@class='wcontent']")
        # The leading "." keeps the query relative to the container; a bare
        # "//p" would search the whole document and ignore the container.
        paragraphs = container.xpath(".//p/text()").extract()

        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n\r \u3000'))
        content_string = ''.join(paragraphs).translate(unwanted)

        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        yield item
Пример #10
0
    def parse_baijiahao_contents(self, response):
        """Extract the article text from a baijiahao page and yield it as an item.

        The text nodes of every ``<span class="bjh-p">`` inside
        ``<div id="detail-page">/<div id="content-container">`` are
        concatenated, then spaces, newlines, non-breaking spaces (U+00A0),
        ideographic spaces (U+3000) and copyright signs (U+00A9) are removed.

        Side effect: ``self.id`` is incremented by one just before the item is
        yielded.

        Args:
            response: The downloaded article page.

        Yields:
            DemospiderItem: ``news_id`` from ``self.getNewsID(response.url)``
            and ``content`` holding the cleaned text (empty when nothing
            matched).
        """
        # Pull every body-text fragment out of the HTML. The leading "." keeps
        # the second query relative to the container; a bare "//span" would
        # search the whole document and ignore the container.
        container = response.xpath('/html/body/div[@id = "detail-page"]/div[@id="content-container"]')
        fragments = container.xpath('.//span[@class="bjh-p"]/text()').extract()

        # Join all fragments into one string and drop line breaks and spaces.
        # Deleting single characters is order-independent, so one translate()
        # pass is enough to drop all of them.
        unwanted = dict.fromkeys(map(ord, '\xa0\xa9\n \u3000'))
        content_string = ''.join(fragments).translate(unwanted)

        # Store the resulting string in the item.
        item = DemospiderItem()
        item['news_id'] = self.getNewsID(response.url)
        item['content'] = content_string

        self.id += 1
        yield item