def parse_item(self, response):
    """Parse a Founder-CMS article page and yield a populated newsItem.

    Pages whose article body is empty are skipped silently (no item yielded).
    """
    title = "".join(
        response.xpath('//td[@class="font01"]/text()').extract()).strip()
    content = "".join(response.xpath(
        '//*[@id="ozoom"]/founder-content/p/text()').extract()).strip()
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # Page label sits in a deeply nested layout table.
    str_page = "".join(response.xpath(
        '//*[@id="logoTable"]/tr/td[1]/table/tr[1]/td/table[2]/tr/td[2]/text()'
    ).extract())
    # re.A restricts \w to ASCII so surrounding non-Latin text is skipped.
    # NOTE(review): [0] raises IndexError if no ASCII token is present.
    page = re.findall(r'\w{1,}', str_page, re.A)[0]
    category = "".join(response.xpath(
        '//*[@id="logoTable"]/tr/td[1]/table/tr[1]/td/table[2]/tr/td[2]/strong/text()'
    ).extract())
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page and yield a newsItem; category comes from meta.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//div[@class="bmnr_con_biaoti"]/text()').extract()).strip()
    content = "".join(response.xpath(
        '//*[@id="ozoom"]/founder-content/p/text()').extract()).strip()
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    str_page = "".join(
        response.xpath('//div[@class="b_bot"]/text()').extract())
    # re.A restricts \w to ASCII; [0] assumes at least one ASCII token.
    page = re.findall(r'\w{1,}', str_page, re.A)[0]
    # Category was attached by the listing-page callback via request meta.
    category = response.meta['category']
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page and yield a newsItem (category unavailable here).

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//p[@class="articleTitle"]/text()').extract())
    str_page = "".join(response.xpath(
        '//div[@class="leftTitle"]/text()').extract())
    # Text looks like "<label>:<page>"; keep the part after the colon.
    # NOTE(review): IndexError if no ":" is present — confirm page markup.
    page = str_page.split(":")[1]
    content = "".join(response.xpath(
        '//div[@class="articleContent"]/p/text()').extract())
    # Publication date is embedded in the URL path as ".../YYYY/MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}/\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # This site exposes no category; store the sentinel used project-wide.
    category = 'null'
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page and yield a newsItem; category comes from meta.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//p[@class="BSHARE_TEXT"]/text()').extract())
    str_page = "".join(response.xpath(
        '//*[@id="currentBM"]/strong/text()').extract())
    # First run of digits in the banner is the page number.
    # NOTE(review): [0] raises IndexError if no digits are present.
    page = re.findall(r'\d{1,}', str_page)[0]
    # Body text may be directly under founder-content or inside <p> children.
    content = "".join(response.xpath(
        '//*[@id="ozoom"]/founder-content/text()|//*[@id="ozoom"]/founder-content/p/text()'
    ).extract())
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # Category was attached by the listing-page callback via request meta.
    category = response.meta['category']
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page and yield a newsItem; category comes from meta.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//div[@class="newsdetatit"]/h3/text()').extract()).strip()
    content = "".join(response.xpath(
        '//div[@class="newsdetatext"]/founder-content/p/text()'
    ).extract()).strip()
    # URL carries the date as ".../YYYYMM/DD/..."; rebuild "YYYY-MM-DD".
    str_date = "".join(re.findall(r'(?<=/)\d{1,}/\d{1,}(?=/)', response.url))
    n_date = str_date.replace('/', '-')
    date = n_date[:4] + '-' + n_date[4:]
    str_page = "".join(response.xpath(
        '//div[@class="newsdetatit"]/p[3]/span[@class="Author"]/text()'
    ).extract())
    # Split on the fullwidth colon used in the Chinese page label.
    # NOTE(review): IndexError if the colon is missing — confirm markup.
    page = str_page.split('：')[1]
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    # Category was attached by the listing-page callback via request meta.
    item['category'] = response.meta['category']
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page and category share one header string.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//div[@class="detailtitle"]/text()').extract()).strip()
    content = "".join(response.xpath(
        '//*[@id="contenttext"]/text()').extract()).strip()
    # Digit-only URL segments concatenated; assumes one YYYYMMDD run —
    # TODO(review): confirm URLs never contain other numeric segments.
    str_date = "".join(re.findall(r'(?<=/)\d{1,}(?=/)', response.url))
    date = str_date[:4] + '-' + str_date[4:6] + '-' + str_date[6:8]
    # One header string holds both the page label and the category.
    str_page_category = "".join(response.xpath(
        '/html/body/div[1]/table[1]/tbody/tr/td[2]/table[2]/tbody/tr[1]/td/div/strong[1]/font/text()'
    ).extract())
    # re.A restricts \w to ASCII; first token is the page label.
    page = re.findall(r'\w{1,}', str_page_category, re.A)[0]
    # Category follows the fullwidth colon in the same header.
    category = str_page_category.split('：')[1]
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page and category share one header text node.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//*[@id="content"]/div/h1/text()').extract())
    content = "".join(response.xpath(
        '//*[@id="content_div"]/p/font/text()').extract())
    # Digit-only URL segments concatenated; assumes one YYYYMMDD run —
    # TODO(review): confirm URLs never contain other numeric segments.
    str_date = "".join(re.findall(r'(?<=/)\d{1,}(?=/)', response.url))
    date = str_date[:4] + '-' + str_date[4:6] + '-' + str_date[6:8]
    # One text node holds both the page label and the category.
    str_page_category = "".join(response.xpath(
        '//*[@id="content"]/div/p[1]/text()[2]').extract())
    # All ASCII word runs joined form the page label (re.A keeps \w ASCII).
    page = "".join(re.findall(r'\w{1,}', str_page_category, re.A))
    # Category is everything after the fullwidth colon.
    category = "".join(re.findall(r'(?<=：).*', str_page_category))
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page label and date both come from the URL.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//div[@class="content"]/h2/text()').extract())
    # Page label appears in the URL as a segment like "/A01/".
    page = "".join(re.findall(r'(?<=/)[A-Z]\d{1,}(?=/)', response.url))
    content = "".join(response.xpath(
        '//div[@class="cnt-main"]/p/text()').extract())
    # Digit-only URL segments concatenated; assumes one YYYYMMDD run —
    # TODO(review): confirm URLs never contain other numeric segments.
    str_date = "".join(re.findall(r'(?<=/)\d{1,}(?=/)', response.url))
    date = str_date[:4] + '-' + str_date[4:6] + '-' + str_date[6:8]
    category = "".join(response.xpath(
        '//div[@class="info"]/span[2]/a/text()').extract())
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page and category were passed via request meta.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//*[@class="newsdetail_bg clearfix"]/h1/text()').extract())
    content = "".join(response.xpath(
        '//*[@class="newsdetail_bg clearfix"]/div[@class="content"]/p/span/text()'
    ).extract())
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # Both values were attached by the listing-page callback.
    page = response.meta['page']
    category = response.meta['category']
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page and category were passed via request meta.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(
        response.xpath('//td[@class="font01"]/text()').extract()).strip()
    content = "".join(response.xpath(
        '//*[@id="ozoom"]/span/p/text()').extract()).strip()
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # Both values were attached by the listing-page callback.
    page = response.meta['page']
    category = response.meta['category']
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; the date is already "YYYY-MM-DD" in the URL.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(
        response.xpath('//div[@id="main"]/h1/text()').extract())
    content = "".join(
        response.xpath('//div[@id="main"]/p/text()').extract())
    # URL already contains a dash-separated date; take the first match.
    # NOTE(review): IndexError if the URL carries no such date.
    date = re.findall(r'\d{1,}-\d{1,}-\d{1,}', response.url)[0]
    str_page = "".join(response.xpath(
        '//div[@id="main"]/div/ul/li[1]/a/text()').extract())
    # re.A restricts \w to ASCII; first token is the page label.
    page = re.findall(r'\w{1,}', str_page, re.A)[0]
    # This site exposes no category; store the sentinel used project-wide.
    category = 'null'
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; this site exposes neither page nor category.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(
        response.xpath('//td[@class="jyrb05"]/text()').extract()).strip()
    content = "".join(response.xpath(
        '//td[@class="jyrb07"]/p/text()').extract()).strip()
    # Digit-only URL segments concatenated; assumes one YYYYMMDD run —
    # TODO(review): confirm URLs never contain other numeric segments.
    str_date = "".join(re.findall(r'(?<=/)\d{1,}(?=/)', response.url))
    date = str_date[:4] + '-' + str_date[4:6] + '-' + str_date[6:8]
    # Sentinels used project-wide for missing fields.
    page = 'null'
    category = 'null'
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item
def parse_item(self, response):
    """Parse an article page; page and category share one navigation string.

    Pages whose article body is empty are skipped silently.
    """
    title = "".join(response.xpath(
        '//div[@class="title1"]/h1/text()').extract()).strip()
    content = "".join(response.xpath(
        '//div[@class="content"]/text()').extract()).strip()
    # Publication date is embedded in the URL path as ".../YYYY-MM/DD/...".
    str_date = "".join(
        re.findall(r'(?<=/)\d{1,}-\d{1,}/\d{1,}(?=/)', response.url))
    date = str_date.replace('/', '-')
    # One navigation entry holds both the page label and the category.
    str_page_category = "".join(response.xpath(
        '//div[@class="next"]/ul/li[1]/text()').extract())
    # re.A restricts \w to ASCII; first token is the page label.
    page = re.findall(r'\w{1,}', str_page_category, re.A)[0]
    # Category follows the fullwidth colon in the same entry.
    category = str_page_category.split('：')[1]
    if content == "":
        return  # empty body: nothing worth indexing
    item = newsItem()
    item['title'] = title
    item['page'] = page
    item['content'] = content
    item['date'] = date
    item['category'] = category
    item['url'] = response.url
    item['newspapers'] = self.newspapers
    yield item