def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Fixes vs. original: removes a leftover debug ``print``, joins the
    content parts instead of quadratic ``+`` concatenation, and guards
    against pages without images (the original raised IndexError).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//div[@class="sherry_title"]/h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@class="introduce"]/text()').extract()[0]
    # Keep the raw HTML of every content <div>, concatenated in order.
    item['content'] = ''.join(
        response.xpath('//div[@id="content"]/div').extract())
    item['pubtime'] = ''
    imageurl = response.xpath(
        '//div[@id="content"]/div/img/@src').extract()
    # Guard: some articles carry no inline images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "墙布窗帘网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Fixes vs. original: joins paragraphs instead of quadratic ``+``
    concatenation and guards ``imageurl[0]`` against image-less pages
    (the original raised IndexError).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath('//h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@class="content"]/p/span/text()').extract()[0]
    a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> tags from every paragraph, keep the rest of the HTML.
    item['content'] = ''.join(
        a_rs.sub('', con)
        for con in response.xpath('//div[@class="content"]/p').extract())
    item['pubtime'] = ''
    # Images may sit inside <p><span> or directly inside <p>.
    imageurl = []
    for xp in ('//div[@class="content"]/p/span/img/@src',
               '//div[@class="content"]/p/img/@src'):
        imageurl.extend(response.xpath(xp).extract())
    # Guard: avoid IndexError when the article has no images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "顶墙集成网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    The cover image URL is passed in from the listing page via
    ``response.meta['cover']``.
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//div[@class="db-ad-structure"]/h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@class="db-ad-structure"]/p[@class="db-lead"]/text()'
    ).extract()[0]
    anchor_pattern = re.compile(r"<a[^>]*>|<\/a>", re.S)
    paragraphs = response.xpath('//div[@class="db-detail"]/div/p').extract()
    # Remove <a> tags from every paragraph before joining them.
    item['content'] = ''.join(anchor_pattern.sub('', p) for p in paragraphs)
    item['pubtime'] = ''
    # Images on this site keep their URL in the "original" attribute
    # (lazy-loading); collect those instead of @src.
    item['imageUrl'] = response.xpath(
        '//div[@class="db-detail"]/div/p/img/@original').extract()
    item['cover'] = response.meta['cover']
    item['source_url'] = response.url
    item['source'] = "中华地板网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Fixes vs. original: joins paragraphs instead of quadratic ``+``
    concatenation and guards ``imageurl[0]`` against image-less pages
    (the original raised IndexError).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//h1[@class="article-title"]/text()').extract()[0]
    item['summary'] = response.xpath(
        '//p[@class="describe"]/text()').extract()[0]
    # Raw paragraph HTML is kept as-is (no <a>-stripping on this site).
    item['content'] = ''.join(response.xpath(
        '//div[@class="article-content fontSizeSmall BSHARE_POP"]/p'
    ).extract())
    item['pubtime'] = response.xpath(
        '//span[@class="date"]/text()').extract()[0]
    imageurl = response.xpath(
        '//div[@class="article-content fontSizeSmall BSHARE_POP"]/p/img/@src'
    ).extract()
    # Guard: avoid IndexError when the article has no images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "搜门网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Image paths on this site are site-relative; they are absolutized
    with the site domain both inside the HTML and in ``imageUrl``.
    The cover path arrives via ``response.meta['cover']``.
    """
    item = ScrapywebItem()
    item['title'] = response.xpath('//h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@class="infor_detail"]/p/text()').extract()[0]
    anchor_re = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> wrappers from every paragraph, keep the inner markup.
    item['content'] = ''.join(
        anchor_re.sub('', paragraph)
        for paragraph in response.xpath(
            '//div[@class="infor_detail"]/p').extract())
    item['pubtime'] = ''
    domain = 'http://www.chinazns.com'
    imageurl = []
    for src in response.xpath(
            '//div[@class="infor_detail"]/p/img/@src').extract():
        # Rewrite the relative image path inside the HTML to an absolute URL.
        item['content'] = item['content'].replace(src, domain + src)
        imageurl.append(domain + src)
    item['cover'] = domain + response.meta['cover']
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "智能锁网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Fixes vs. original: joins paragraphs instead of quadratic ``+``
    concatenation and guards ``imageurl[0]`` against image-less pages
    (the original raised IndexError).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//div[@class="main-left fl"]/h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@id="digest"]/p/text()').extract()[0]
    a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> tags from each paragraph of the article body.
    item['content'] = ''.join(
        a_rs.sub('', con)
        for con in response.xpath('//div[@id="articleText"]/p').extract())
    item['pubtime'] = ''
    imageurl = response.xpath(
        '//div[@id="articleText"]/p/img/@src').extract()
    # Guard: avoid IndexError when the article has no images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "新浪家居"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    Fixes vs. original: ``summary`` was assigned the whole ``extract()``
    list; every sibling parser stores a single string, so the first
    match is taken (empty string when absent).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath('//h1/text()').extract()[0]
    summary = response.xpath('//div[@class="txtzy"]/text()').extract()
    item['summary'] = summary[0] if summary else ''
    # Raw paragraph HTML is kept as-is (no <a>-stripping on this site).
    item['content'] = ''.join(
        response.xpath('//div[@id="newdcont"]/p').extract())
    item['pubtime'] = ''
    domain = 'http://www.mumen.com.cn'
    imageurl = []
    for src in response.xpath(
            '//div[@id="newdcont"]/p/img/@src').extract():
        # Rewrite the relative image path inside the HTML to an absolute URL.
        item['content'] = item['content'].replace(src, domain + src)
        imageurl.append(domain + src)
    # NOTE: cover paths from meta carry no leading slash on this site.
    item['cover'] = domain + '/' + response.meta['cover']
    item['imageUrl'] = imageurl
    item['source_url'] = response.url
    item['source'] = "木门网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    The summary is handed over from the listing page via
    ``response.meta['summary']``. Fixes vs. original: guards
    ``imageurl[0]`` against image-less pages (IndexError) and joins
    paragraphs instead of quadratic ``+`` concatenation.
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//div[@class="post_content_main"]/h1/text()').extract()[0]
    item['source_url'] = response.url
    item['source'] = "网易家居"
    a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> tags from each paragraph of the article body.
    item['content'] = ''.join(
        a_rs.sub('', con)
        for con in response.xpath('//div[@class="post_text"]/p').extract())
    item['pubtime'] = ''
    imageurl = []
    for img in response.xpath(
            '//div[@class="post_text"]/p/img/@src').extract():
        # Drop query string / fragment, keeping scheme://host/path only.
        url_data = urlparse(img)
        imageurl.append(
            url_data.scheme + '://' + url_data.netloc + url_data.path)
    # Guard: avoid IndexError when the article has no images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['imageUrl'] = imageurl
    item['summary'] = response.meta['summary']
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    The cover image (from ``response.meta['cover']``) is prepended to
    the article body as a centered <img> and is the only entry in
    ``imageUrl``; inline images are intentionally not collected.
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//h1[@id="post_title"]/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@id="post_brief"]/text()').extract()[0]
    item['cover'] = response.meta['cover']
    drop_links = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> tags from every paragraph, then join them in order.
    body = ''.join(
        drop_links.sub('', para)
        for para in response.xpath(
            '//div[@id="post_description"]/p').extract())
    item['content'] = (
        '<p style="text-align: center;"><img src="'
        + item['cover'] + '" /></p>' + body)
    item['pubtime'] = response.xpath(
        '//div[@id="post_date"]/text()').extract()[0]
    item['imageUrl'] = [item['cover']]
    item['source_url'] = response.url
    item['source'] = "亿欧"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    The whole ``infor_detail`` container is taken as content, minus its
    trailing "more_message" block and all <a> tags; image paths are
    absolutized with the site domain. Fixes vs. original: removes an
    unused compiled ``link_rs`` regex and a dead ``content = ''``
    initialization that was immediately overwritten.
    """
    item = ScrapywebItem()
    item['title'] = response.xpath('//h1/text()').extract()[0]
    item['summary'] = response.xpath(
        '//div[@class="infor_detail"]/p/text()').extract()[0]
    a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
    content = response.xpath('//div[@class="infor_detail"]').extract()[0]
    # Cut the "read more" footer block out of the container HTML.
    more_message = response.xpath(
        '//div[@class="infor_detail"]/div[@class="more_message"]').extract(
        )[0]
    content = content.replace(more_message, '')
    item['content'] = a_rs.sub('', content)  # strip <a> tags
    item['pubtime'] = ''
    domain = 'http://www.znjj.tv'
    imageurl = []
    # Images may appear under <center> or directly in <p>; absolutize both.
    for xp in ('//div[@class="infor_detail"]/center/img/@src',
               '//div[@class="infor_detail"]/p/img/@src'):
        for src in response.xpath(xp).extract():
            item['content'] = item['content'].replace(src, domain + src)
            imageurl.append(domain + src)
    item['imageUrl'] = imageurl
    item['cover'] = domain + response.meta['cover']
    item['source_url'] = response.url
    item['source'] = "智家网"
    yield item
def parse_detail(self, response):
    """Parse a detail page and yield one populated ScrapywebItem.

    The summary arrives via ``response.meta['summary']``; image paths
    are absolutized with the site domain. Fixes vs. original: guards
    ``imageurl[0]`` against image-less pages (IndexError).
    """
    item = ScrapywebItem()
    item['title'] = response.xpath(
        '//div[@class="detail_body"]/h1[@class="text-left"]/text()'
    ).extract()[0]
    a_rs = re.compile(r"<a[^>]*>|<\/a>", re.S)
    # Strip <a> tags from the whole detail container.
    item['content'] = ''.join(
        a_rs.sub('', con)
        for con in response.xpath('//div[@class="detail_content"]').extract())
    item['pubtime'] = ''
    domain = 'http://www.27580.cn'
    imageurl = []
    # Images sit either in <p> or in nested <div>; absolutize both.
    for xp in ('//div[@class="detail_content"]/p/img/@src',
               '//div[@class="detail_content"]/div/img/@src'):
        for src in response.xpath(xp).extract():
            item['content'] = item['content'].replace(src, domain + src)
            imageurl.append(domain + src)
    item['summary'] = response.meta['summary']
    item['imageUrl'] = imageurl
    # Guard: avoid IndexError when the article has no images.
    item['cover'] = imageurl[0] if imageurl else ''
    item['source_url'] = response.url
    item['source'] = "中外涂料网"
    yield item