Пример #1
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is the serialized HTML of every direct child ``<div>`` of
        ``div#content``, concatenated in document order. Image URLs are the
        ``src`` attributes of ``<img>`` elements directly inside those divs.

        Raises:
            IndexError: if the title or the summary node is not found.
        """
        item = ScrapywebItem()
        item['title'] = response.xpath(
            '//div[@class="sherry_title"]/h1/text()').extract()[0]
        item['summary'] = response.xpath(
            '//div[@class="introduce"]/text()').extract()[0]

        item['content'] = ''.join(
            response.xpath('//div[@id="content"]/div').extract())
        item['pubtime'] = ''

        imageurl = response.xpath(
            '//div[@id="content"]/div/img/@src').extract()

        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead, the
        # same placeholder value already used for 'pubtime'.
        item['cover'] = imageurl[0] if imageurl else ''
        item['imageUrl'] = imageurl
        item['source_url'] = response.url
        item['source'] = "墙布窗帘网"

        yield item
Пример #2
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is every ``<p>`` under ``div.content`` with ``<a ...>`` and
        ``</a>`` tags removed (the text between them is kept). Image URLs
        are collected from ``<img>`` elements nested in ``p/span`` first,
        then from those directly inside ``p``.

        Raises:
            IndexError: if the title or the summary node is not found.
        """
        item = ScrapywebItem()
        item['title'] = response.xpath('//h1/text()').extract()[0]
        item['summary'] = response.xpath(
            '//div[@class="content"]/p/span/text()').extract()[0]

        # Strip the anchor tags from the content.
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
        item['content'] = ''.join(
            a_rs.sub('', con)
            for con in response.xpath('//div[@class="content"]/p').extract())
        item['pubtime'] = ''

        imageurl = (
            response.xpath(
                '//div[@class="content"]/p/span/img/@src').extract() +
            response.xpath(
                '//div[@class="content"]/p/img/@src').extract())

        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead, the
        # same placeholder value already used for 'pubtime'.
        item['cover'] = imageurl[0] if imageurl else ''
        item['imageUrl'] = imageurl
        item['source_url'] = response.url
        item['source'] = "顶墙集成网"

        yield item
Пример #3
0
    def parse_detail(self, response):
        """Turn a detail-page response into a single ``ScrapywebItem``.

        Title and lead paragraph come from ``div.db-ad-structure``; the body
        is every ``<p>`` under ``div.db-detail/div`` with anchor tags removed.
        Image URLs are read from the ``original`` attribute of the ``<img>``
        elements, and the cover is passed in through ``response.meta``.

        Raises:
            IndexError: if the title or the lead paragraph is not found.
            KeyError: if ``response.meta`` has no ``'cover'`` entry.
        """
        article = ScrapywebItem()

        headline_nodes = response.xpath(
            '//div[@class="db-ad-structure"]/h1/text()').extract()
        article['title'] = headline_nodes[0]

        lead_nodes = response.xpath(
            '//div[@class="db-ad-structure"]/p[@class="db-lead"]/text()'
        ).extract()
        article['summary'] = lead_nodes[0]

        # Strip the anchor tags from the content.
        anchor_tag = re.compile(r"<a[^>]*>|<\/a>", re.S)
        paragraphs = response.xpath('//div[@class="db-detail"]/div/p').extract()
        article['content'] = ''.join(
            anchor_tag.sub('', paragraph) for paragraph in paragraphs)
        article['pubtime'] = ''

        pictures = response.xpath(
            '//div[@class="db-detail"]/div/p/img/@original').extract()
        article['imageUrl'] = list(pictures)
        article['cover'] = response.meta['cover']
        article['source_url'] = response.url
        article['source'] = "中华地板网"

        yield article
Пример #4
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is the serialized HTML of every ``<p>`` inside the
        ``article-content`` div, concatenated in document order. Image URLs
        are the ``src`` attributes of ``<img>`` elements directly inside
        those paragraphs.

        Raises:
            IndexError: if the title, summary or date node is not found.
        """
        item = ScrapywebItem()
        item['title'] = response.xpath(
            '//h1[@class="article-title"]/text()').extract()[0]
        item['summary'] = response.xpath(
            '//p[@class="describe"]/text()').extract()[0]

        item['content'] = ''.join(
            response.xpath(
                '//div[@class="article-content fontSizeSmall BSHARE_POP"]/p'
            ).extract())
        item['pubtime'] = response.xpath(
            '//span[@class="date"]/text()').extract()[0]

        imageurl = response.xpath(
            '//div[@class="article-content fontSizeSmall BSHARE_POP"]/p/img/@src'
        ).extract()

        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead.
        item['cover'] = imageurl[0] if imageurl else ''
        item['imageUrl'] = imageurl
        item['source_url'] = response.url
        item['source'] = "搜门网"

        yield item
Пример #5
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is every ``<p>`` under ``div.infor_detail`` with anchor tags
        removed. Each image ``src`` found in those paragraphs is prefixed
        with the site domain, both in the body HTML and in the image list.
        The cover path comes from ``response.meta`` and gets the same prefix.

        Raises:
            IndexError: if the title or the summary node is not found.
            KeyError: if ``response.meta`` has no ``'cover'`` entry.
        """
        domain = 'http://www.chinazns.com'

        item = ScrapywebItem()
        item['title'] = response.xpath('//h1/text()').extract()[0]
        item['summary'] = response.xpath(
            '//div[@class="infor_detail"]/p/text()').extract()[0]

        # Strip the anchor tags from the content.
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
        content = ''.join(
            a_rs.sub('', con)
            for con in response.xpath('//div[@class="infor_detail"]/p').extract())

        image_srcs = response.xpath(
            '//div[@class="infor_detail"]/p/img/@src').extract()

        # Prefix every distinct src inside the HTML in a single regex pass.
        # Chained str.replace calls prefixed a src twice when it occurred more
        # than once in the list ("http://hosthttp://host/...") and corrupted
        # a src that contains another one as a substring; trying the longer
        # srcs first in one pass avoids both. Empty srcs are skipped because
        # replacing '' would insert the domain between every character.
        distinct_srcs = sorted(set(src for src in image_srcs if src),
                               key=len, reverse=True)
        if distinct_srcs:
            src_rs = re.compile(
                '|'.join(re.escape(src) for src in distinct_srcs))
            content = src_rs.sub(lambda m: domain + m.group(0), content)

        item['content'] = content
        item['pubtime'] = ''
        item['cover'] = domain + response.meta['cover']
        item['imageUrl'] = [domain + src for src in image_srcs]
        item['source_url'] = response.url
        item['source'] = "智能锁网"

        yield item
Пример #6
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is every ``<p>`` under ``div#articleText`` with ``<a ...>``
        and ``</a>`` tags removed (the text between them is kept). Image URLs
        are the ``src`` attributes of ``<img>`` elements directly inside
        those paragraphs.

        Raises:
            IndexError: if the title or the summary node is not found.
        """
        item = ScrapywebItem()
        item['title'] = response.xpath(
            '//div[@class="main-left fl"]/h1/text()').extract()[0]
        item['summary'] = response.xpath(
            '//div[@id="digest"]/p/text()').extract()[0]

        # Strip the anchor tags from the content.
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
        item['content'] = ''.join(
            a_rs.sub('', con)
            for con in response.xpath('//div[@id="articleText"]/p').extract())
        item['pubtime'] = ''

        imageurl = response.xpath(
            '//div[@id="articleText"]/p/img/@src').extract()

        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead, the
        # same placeholder value already used for 'pubtime'.
        item['cover'] = imageurl[0] if imageurl else ''
        item['imageUrl'] = imageurl
        item['source_url'] = response.url
        item['source'] = "新浪家居"

        yield item
Пример #7
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is the serialized HTML of every ``<p>`` under
        ``div#newdcont``. Each image ``src`` found in those paragraphs is
        prefixed with the site domain, both in the body HTML and in the image
        list. The cover path comes from ``response.meta``.

        Raises:
            IndexError: if the title node is not found.
            KeyError: if ``response.meta`` has no ``'cover'`` entry.
        """
        import re  # local import: this block did not previously rely on `re`

        domain = 'http://www.mumen.com.cn'

        item = ScrapywebItem()
        item['title'] = response.xpath('//h1/text()').extract()[0]
        # The summary used to be stored as the raw list returned by
        # extract(); join the text nodes so it is a plain string like 'title'.
        item['summary'] = ''.join(
            response.xpath('//div[@class="txtzy"]/text()').extract())

        content = ''.join(
            response.xpath('//div[@id="newdcont"]/p').extract())

        image_srcs = response.xpath(
            '//div[@id="newdcont"]/p/img/@src').extract()

        # Prefix every distinct src inside the HTML in a single regex pass.
        # Chained str.replace calls prefixed a src twice when it occurred more
        # than once in the list ("http://hosthttp://host/...") and corrupted
        # a src that contains another one as a substring; trying the longer
        # srcs first in one pass avoids both. Empty srcs are skipped because
        # replacing '' would insert the domain between every character.
        distinct_srcs = sorted(set(src for src in image_srcs if src),
                               key=len, reverse=True)
        if distinct_srcs:
            src_rs = re.compile(
                '|'.join(re.escape(src) for src in distinct_srcs))
            content = src_rs.sub(lambda m: domain + m.group(0), content)

        item['content'] = content
        item['pubtime'] = ''
        # NOTE(review): the cover prefix carries a trailing slash while the
        # image prefix does not -- presumably meta['cover'] has no leading
        # slash; confirm against the request that sets it.
        item['cover'] = 'http://www.mumen.com.cn/' + response.meta['cover']
        item['imageUrl'] = [domain + src for src in image_srcs]
        item['source_url'] = response.url
        item['source'] = "木门网"

        yield item
Пример #8
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is every ``<p>`` under ``div.post_text`` with ``<a ...>``
        and ``</a>`` tags removed (the text between them is kept). Each image
        URL is rebuilt from its scheme, host and path only, which drops any
        query string or fragment. The summary comes from ``response.meta``.

        Raises:
            IndexError: if the title node is not found.
            KeyError: if ``response.meta`` has no ``'summary'`` entry.
        """
        item = ScrapywebItem()
        item['title'] = response.xpath(
            '//div[@class="post_content_main"]/h1/text()').extract()[0]
        item['source_url'] = response.url
        item['source'] = "网易家居"

        # Strip the anchor tags from the content.
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
        item['content'] = ''.join(
            a_rs.sub('', con)
            for con in response.xpath('//div[@class="post_text"]/p').extract())
        item['pubtime'] = ''

        imageurl = []
        for img in response.xpath(
                '//div[@class="post_text"]/p/img/@src').extract():
            url_data = urlparse(img)
            # NOTE(review): a src without a scheme would yield "://..." here;
            # presumably this site always emits absolute URLs -- confirm.
            imageurl.append(
                url_data.scheme + '://' + url_data.netloc + url_data.path)

        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead, the
        # same placeholder value already used for 'pubtime'.
        item['cover'] = imageurl[0] if imageurl else ''

        item['imageUrl'] = imageurl
        item['summary'] = response.meta['summary']

        yield item
Пример #9
0
    def parse_detail(self, response):
        """Turn a detail-page response into a single ``ScrapywebItem``.

        The cover URL is passed in through ``response.meta``. It is embedded
        as a centered ``<img>`` paragraph at the top of the body and is also
        the only entry of the image list. The rest of the body is every
        ``<p>`` under ``div#post_description`` with anchor tags removed.

        Raises:
            IndexError: if the title, brief or date node is not found.
            KeyError: if ``response.meta`` has no ``'cover'`` entry.
        """
        article = ScrapywebItem()

        title_nodes = response.xpath('//h1[@id="post_title"]/text()').extract()
        article['title'] = title_nodes[0]

        brief_nodes = response.xpath('//div[@id="post_brief"]/text()').extract()
        article['summary'] = brief_nodes[0]

        cover = response.meta['cover']
        article['cover'] = cover

        # Strip the anchor tags from the content.
        anchor_tag = re.compile(r"<a[^>]*>|<\/a>", re.S)
        paragraphs = response.xpath('//div[@id="post_description"]/p').extract()
        body = ''.join(anchor_tag.sub('', paragraph) for paragraph in paragraphs)

        cover_block = '<p style="text-align: center;"><img src="' + cover + '" /></p>'
        article['content'] = cover_block + body

        date_nodes = response.xpath('//div[@id="post_date"]/text()').extract()
        article['pubtime'] = date_nodes[0]

        article['imageUrl'] = [cover]
        article['source_url'] = response.url
        article['source'] = "亿欧"

        yield article
Пример #10
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is the whole ``div.infor_detail`` element with its nested
        ``div.more_message`` block cut out and all ``<a ...>`` / ``</a>``
        tags removed. Image ``src`` values found under ``center/img`` and
        ``p/img`` are prefixed with the site domain, both in the body HTML
        and in the image list. The cover path comes from ``response.meta``
        and gets the same prefix.

        Raises:
            IndexError: if the title, the summary or the ``infor_detail``
                container is not found.
            KeyError: if ``response.meta`` has no ``'cover'`` entry.
        """
        domain = 'http://www.znjj.tv'

        item = ScrapywebItem()
        item['title'] = response.xpath('//h1/text()').extract()[0]
        item['summary'] = response.xpath(
            '//div[@class="infor_detail"]/p/text()').extract()[0]
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)

        content = response.xpath('//div[@class="infor_detail"]').extract()[0]

        # Cut the "more_message" block out of the container. A page without
        # that block used to raise IndexError; now it is left untouched.
        more_message = response.xpath(
            '//div[@class="infor_detail"]/div[@class="more_message"]'
        ).extract()
        if more_message:
            content = content.replace(more_message[0], '')

        # Strip the anchor tags from the content.
        content = a_rs.sub('', content)

        image_srcs = (
            response.xpath(
                '//div[@class="infor_detail"]/center/img/@src').extract() +
            response.xpath(
                '//div[@class="infor_detail"]/p/img/@src').extract())

        # Prefix every distinct src inside the HTML in a single regex pass.
        # Chained str.replace calls prefixed a src twice when it occurred more
        # than once in the list ("http://hosthttp://host/...") and corrupted
        # a src that contains another one as a substring; trying the longer
        # srcs first in one pass avoids both. Empty srcs are skipped because
        # replacing '' would insert the domain between every character.
        distinct_srcs = sorted(set(src for src in image_srcs if src),
                               key=len, reverse=True)
        if distinct_srcs:
            src_rs = re.compile(
                '|'.join(re.escape(src) for src in distinct_srcs))
            content = src_rs.sub(lambda m: domain + m.group(0), content)

        item['content'] = content
        item['pubtime'] = ''
        item['imageUrl'] = [domain + src for src in image_srcs]
        item['cover'] = domain + response.meta['cover']
        item['source_url'] = response.url
        item['source'] = "智家网"

        yield item
Пример #11
0
    def parse_detail(self, response):
        """Extract one article from ``response`` and yield it as an item.

        The body is every ``div.detail_content`` element with ``<a ...>`` /
        ``</a>`` tags removed. Image ``src`` values found under ``p/img`` and
        ``div/img`` are prefixed with the site domain, both in the body HTML
        and in the image list. The summary comes from ``response.meta``.

        Raises:
            IndexError: if the title node is not found.
            KeyError: if ``response.meta`` has no ``'summary'`` entry.
        """
        domain = 'http://www.27580.cn'

        item = ScrapywebItem()
        item['title'] = response.xpath(
            '//div[@class="detail_body"]/h1[@class="text-left"]/text()'
        ).extract()[0]

        # Strip the anchor tags from the content.
        a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
        content = ''.join(
            a_rs.sub('', con)
            for con in response.xpath('//div[@class="detail_content"]').extract())

        image_srcs = (
            response.xpath(
                '//div[@class="detail_content"]/p/img/@src').extract() +
            response.xpath(
                '//div[@class="detail_content"]/div/img/@src').extract())

        # Prefix every distinct src inside the HTML in a single regex pass.
        # Chained str.replace calls prefixed a src twice when it occurred more
        # than once in the list ("http://hosthttp://host/...") and corrupted
        # a src that contains another one as a substring; trying the longer
        # srcs first in one pass avoids both. Empty srcs are skipped because
        # replacing '' would insert the domain between every character.
        distinct_srcs = sorted(set(src for src in image_srcs if src),
                               key=len, reverse=True)
        if distinct_srcs:
            src_rs = re.compile(
                '|'.join(re.escape(src) for src in distinct_srcs))
            content = src_rs.sub(lambda m: domain + m.group(0), content)

        imageurl = [domain + src for src in image_srcs]

        item['content'] = content
        item['pubtime'] = ''
        item['summary'] = response.meta['summary']
        item['imageUrl'] = imageurl
        # An article without inline images used to raise IndexError here and
        # the whole item was lost; fall back to an empty cover instead, the
        # same placeholder value already used for 'pubtime'.
        item['cover'] = imageurl[0] if imageurl else ''
        item['source_url'] = response.url
        item['source'] = "中外涂料网"

        yield item