def parse_baijiahao_contents(self, response):
    # Extract every paragraph of the article body from the HTML
    content = response.xpath(
        '/html/body/div[@id="detail-page"]/div[@id="content-container"]')
    content = content.xpath('.//span[@class="bjh-p"]/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters from the body text
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    # Store the extracted string in an item
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    self.id += 1
    yield item
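# The content parsers above and below all call self.getNewsID(response.url),
# which is not defined in this section. A minimal sketch, assuming the id is
# looked up from the same "new" table that the dispatching parse() queries and
# that the table has an "id" column (an assumption); the real helper may work
# differently. It mirrors the dispatcher's convention of returning [] when no
# row is found.
def getNewsID(self, url):
    cursor = self.mydatabase.cursor()
    # Parameterized lookup against the assumed "id" column of the "new" table
    cursor.execute("select id from new where url=%s", (url,))
    result = cursor.fetchone()
    cursor.close()
    return result[0] if result else []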
def parse(self, response): news_body = response.xpath("//div[@class='content_czyd']")[0] news_list = news_body.xpath(".//div[@class='list_list']") for news in news_list: title = news.xpath("./div/dt/a/text()") time = news.xpath("./div/dt/span/text()") content = news.xpath("./div/dd/a/text()") href = news.xpath("./div/dt/a/@href") if len(title) == 0 or len(time) == 0 or len(content) == 0 or len( href) == 0: continue title = title[0].extract() time = time[0].extract() content = content[0].extract() href = prefixURL + href[0].extract() author = "北京鲁迅博物馆" description = "1" tag = 1 item = MuseumNewsSpiderItem() item['title'] = title item['author'] = author item['time'] = time item['description'] = description item['content'] = content item['url'] = href item['tag'] = tag yield item print('page = {}'.format(self.page)) if self.page <= 6: self.page += 1 new_url = URL.format(page=self.page) print(new_url) yield Request(new_url, callback=self.parse, dont_filter=True)
def parse(self, response): news_body = response.xpath("//td[@height='450']")[0] news_list = news_body.xpath(".//table[@width='85%']") for news in news_list: info = news.xpath(".//text()") if len(info) == 0: continue title = info[1].extract() time = info[0].extract().replace("\xa0", "") content = title href = prefixURL + news.xpath(".//@href")[0].extract() author = "首都博物馆" description = "1" tag = 1 item = MuseumNewsSpiderItem() item['title'] = title item['author'] = author item['time'] = time item['description'] = description item['content'] = content item['url'] = href item['tag'] = tag yield item print('page = {}'.format(self.page)) if self.page < 71: self.page += 1 new_url = URL.format(page=self.page) print(new_url) yield Request(new_url, callback=self.parse, dont_filter=True)
def parse(self, response):
    news_list = response.xpath('//div[@class="result"]')
    # An empty result list means we have gone past the last page
    if not news_list:
        self.end = True
        return
    for news in news_list:
        href = news.xpath('./h3[@class="c-title"]/a/@href').extract()
        url = "".join(href).replace("\n", "").replace(" ", "")
        title = news.xpath('./h3[@class="c-title"]/a/text()').extract()
        title = "".join(title).replace("\n", "").replace(" ", "")
        content = news.xpath(
            './div[@class="c-summary c-row "]/text()').extract()
        content = "".join(content).replace("\n", "").replace(" ", "")
        # Some summaries are wrapped in an extra div; fall back to it when empty
        if content == "":
            content = news.xpath(
                './div[@class="c-summary c-row "]/div[2]/text()').extract()
            content = "".join(content).replace("\n", "").replace(" ", "")
        author_time = news.xpath(
            './div[@class="c-summary c-row "]//p[@class="c-author"]/text()'
        ).extract()
        author_time = "".join(author_time).replace("\n", "").replace(" ", "").split()
        author = ""
        time = ""
        if author_time:  # some news entries carry no author or time
            author = author_time[0]
            s_time = author_time[1]
            if s_time:
                # Normalize the raw time string
                time = self.parse_time(s_time)
            else:
                time = s_time
        description = "1"
        tag = 1
        item = MuseumNewsSpiderItem()
        item['title'] = title
        item['author'] = author
        item['time'] = time
        item['description'] = description
        item['content'] = content
        item['url'] = url
        item['tag'] = tag
        yield item
    print('page = {}'.format(self.page))
    # Request the next result page (the page parameter advances in steps of 10)
    if not self.end:
        self.page += 1
        new_url = URL.format(museum=self.museum, bt=self.startTime,
                             et=self.endTime, page=self.page * 10)
        print(new_url)
        yield Request(new_url, callback=self.parse, dont_filter=True)
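# parse_time() used above is not shown in this section. A minimal sketch,
# assuming the search result page reports either relative times such as
# "3小时前" / "2天前" or absolute dates such as "2021年5月1日"; the formats
# actually handled by the project may differ.
import re
from datetime import datetime, timedelta

def parse_time(self, s_time):
    now = datetime.now()
    match = re.match(r"(\d+)小时前", s_time)
    if match:  # "N hours ago"
        return (now - timedelta(hours=int(match.group(1)))).strftime("%Y-%m-%d")
    match = re.match(r"(\d+)天前", s_time)
    if match:  # "N days ago"
        return (now - timedelta(days=int(match.group(1)))).strftime("%Y-%m-%d")
    match = re.match(r"(\d+)年(\d+)月(\d+)日", s_time)
    if match:  # "YYYY年MM月DD日"
        return "{:04d}-{:02d}-{:02d}".format(*(int(g) for g in match.groups()))
    return s_time  # unknown format: keep the raw string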
def parse(self, response):
    # Dispatch each news URL to the matching site-specific content parser
    url = response.url
    if "baijiahao" in url:
        yield scrapy.Request(url, callback=self.parse_baijiahao_contents)
    elif "new.qq.com" in url:
        yield scrapy.Request(url, callback=self.parse_tencent_contents)
    elif "news.ifeng.com" in url:
        yield scrapy.Request(url, callback=self.parse_ifeng_contents)
    elif "news.163.com" in url:
        yield scrapy.Request(url, callback=self.parse_163_news_contents)
    elif "3g.163.com" in url:
        yield scrapy.Request(url, callback=self.parse_3g_163_contents)
    elif "thepaper.cn" in url:
        yield scrapy.Request(url, callback=self.parse_pengpai_contents)
    elif "news.sina.com.cn" in url:
        yield scrapy.Request(url, callback=self.parse_sina_contents)
    elif "paper.people.com.cn" in url:
        yield scrapy.Request(url, callback=self.parse_paper_people_contents)
    elif "xinhuanet.com" in url:
        yield scrapy.Request(url, callback=self.parse_xinhuanet_contents)
    elif "bmnh.org.cn" in url:
        yield scrapy.Request(url, callback=self.parse_bmnh_contents)
    elif "capitalmuseum" in url:
        yield scrapy.Request(url, callback=self.parse_capital_museum_contents)
    elif "cstm.cdstm.cn" in url:
        yield scrapy.Request(url, callback=self.parse_cstm_contents)
    elif "luxunmuseum" in url:
        yield scrapy.Request(url, callback=self.parse_luxunmuseum_contents)
    elif "jb.mil.cn" in url:
        yield scrapy.Request(url, callback=self.parse_military_museum_contents)
    elif "gmc" in url:
        yield scrapy.Request(url, callback=self.parse_gmc_contents)
    else:
        # Unknown source: fall back to the content already stored in the database
        item = MuseumNewsSpiderItem()
        cursor = self.mydatabase.cursor()
        sql = "select content from new where url='{}'".format(url)
        cursor.execute(sql)
        result = cursor.fetchone()
        cursor.close()
        news_id = self.getNewsID(url)
        if news_id != []:
            item['news_id'] = news_id
        else:
            item['news_id'] = -1
        item['main_content'] = result[0]
        yield item
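# The dispatcher's fallback branch uses self.mydatabase, which is not set up in
# this section. A minimal sketch of the connection, assuming pymysql; the host,
# credentials, database name and the initial value of the self.id counter are
# placeholders, not values from the original project.
import pymysql

def __init__(self, *args, **kwargs):
    super().__init__(*args, **kwargs)
    self.id = 0  # running counter incremented in parse_baijiahao_contents
    self.mydatabase = pymysql.connect(host="localhost",
                                      user="root",
                                      password="password",
                                      database="museum_news",
                                      charset="utf8mb4")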
def parse_xinhuanet_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath('//div[@id="p-detail"]')
    content = content.xpath('.//p/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_tencent_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath('/html/body')
    content = content.xpath('.//p[@class="one-p"]/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_gmc_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath('//div[@class="article-cont"]')
    content = content.xpath('.//p/span/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    content_string = content_string.replace("\xa0", "").replace("\r", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_capital_museum_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath("//span[@class='wcontent']")
    content = content.xpath(".//p/text()").extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    content_string = content_string.replace("\xa0", "").replace("\r", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_ifeng_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath("//div[@class='text-3zQ3cZD4']")
    content = content.xpath(".//p/text()").extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_pengpai_contents(self, response):
    # Extract the paragraphs of the article body (positional path into the page)
    content = response.xpath(
        '//*[@id="root"]/div/div[3]/div[1]/div[1]/div[3]/div/div[1]')
    content = content[0].xpath('.//p/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
def parse_bmnh_contents(self, response):
    # Extract the paragraphs of the article body
    content = response.xpath(
        '//div[@class="content_singler"]/div[@class="single_block"]')
    content = content.xpath('.//p')
    content = content.xpath('.//span/text()').extract()
    # Join all sentences of the body into a single string
    content_string = ""
    for piece in content:
        if piece != "":
            content_string += piece.replace('\xa0', '').replace('\xa9', '')
    # Strip newlines and whitespace characters
    content_string = content_string.replace('\n', "")
    content_string = content_string.replace(" ", "")
    content_string = content_string.replace("\u3000", "")
    item = MuseumNewsSpiderItem()
    item['news_id'] = self.getNewsID(response.url)
    item['main_content'] = content_string
    yield item
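# Every parser above fills a MuseumNewsSpiderItem. Its definition is not part
# of this section; reconstructed from the fields assigned above, the items.py
# declaration presumably looks roughly like this (the real file may declare
# additional fields):
import scrapy

class MuseumNewsSpiderItem(scrapy.Item):
    # fields filled by the list-page parsers
    title = scrapy.Field()
    author = scrapy.Field()
    time = scrapy.Field()
    description = scrapy.Field()
    content = scrapy.Field()
    url = scrapy.Field()
    tag = scrapy.Field()
    # fields filled by the content parsers
    news_id = scrapy.Field()
    main_content = scrapy.Field()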