def parse(self, response): for each in response.xpath('//ul[@class="info-list"]/li'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/text()").extract()[0].strip() item['url'] = response.urljoin( each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['stime'] = 'None' item['speaker'] = 'None' item['place'] = 'None' for detail in each.xpath('./div/ul/li'): if detail.xpath('./strong/text()').extract()[0].find( '时间') != -1: item['stime'] = Tu.unite( detail.xpath('./span/text()').extract()[0]) if detail.xpath('./strong/text()').extract()[0].find( '地点') != -1: item['place'] = detail.xpath('./span/text()').extract()[0] if detail.xpath('./strong/text()').extract()[0].find( '演讲') != -1: item['speaker'] = detail.xpath( './span/text()').extract()[0] yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset) + '.html', callback=self.parse)
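# All of the parse() methods in this section lean on the same spider
# attributes. A minimal sketch of that shared scaffolding, assuming the
# class name and base URL (both hypothetical) — only `name`, `url`,
# `offset`, and `MAX_PAGE` are confirmed by the code itself:
import scrapy

class LectureSpider(scrapy.Spider):       # hypothetical class name
    name = 'example_department'           # reused as item['department']
    url = 'http://example.edu/lectures/'  # hypothetical list-page base URL
    offset = 1                            # current page index
    MAX_PAGE = 1                          # last page to request
    start_urls = [url + str(offset)]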
def parse(self, response): for each in response.xpath('//div[@class="section"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./dl/dt/a/text()").extract()[0].replace('\r\n', '').strip(' ') item['url'] = response.urljoin(each.xpath("./dl/dt/a/@href").extract()[0]) item['department'] = self.name text_list = each.xpath("./div/text()").extract() if len(text_list) > 0: item['stime'] = Tu.unite(text_list[0]) else: item['stime'] = 'None' text_list = each.xpath("./dl/dd[1]/text()").extract() if len(text_list) > 0: item['speaker'] = text_list[0] else: item['speaker'] = 'None' text_list = each.xpath("./dl/dd[3]/text()").extract() if len(text_list) > 0: item['place'] = text_list[0] else: item['place'] = 'None' yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//div[@class="newsAcademicListRow"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./p[1]/a/text()").extract()[0] item['url'] = response.urljoin( each.xpath("./p[1]/a/@href").extract()[0]) item['department'] = self.name try: index = 2 item['speaker'] = item['place'] = item['stime'] = 'None' while True: text = each.xpath("./p[" + str(index) + "]/text()").extract()[0] if text.find('演讲者') > -1: item['speaker'] = text.replace('演讲者:', '') if text.find('地点') > -1: item['place'] = text.replace('地点:', '') if text.find('时间') > -1: item['stime'] = Tu.unite(text.replace('时间:', '')) index += 1 except IndexError: pass yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    div_list = response.xpath('//div[@class="ch_content_dynamic"]/div')
    div_list.pop(0)  # the first <div> is not a lecture entry
    for each in div_list:
        item = ProjectSpidersItem()
        item['name'] = each.xpath('./a/div[2]/div[1]/text()').extract()[0]
        item['url'] = response.urljoin(each.xpath('./a/@href').extract()[0])
        item['department'] = self.name
        text_list = each.xpath('./a/div[2]/p/span/text()').extract()
        item['stime'] = Tu.unite(text_list[0]) if text_list else 'None'
        text_list = each.xpath('./a/div[2]/div[2]/div/span[1]/text()').extract()
        item['speaker'] = text_list[0] if text_list else 'None'
        text_list = each.xpath('./a/div[2]/div[2]/div/span[3]/text()').extract()
        item['place'] = text_list[0] if text_list else 'None'
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset) + '.html',
                             callback=self.parse)
def parse(self, response):
    # This endpoint returns JSON rather than HTML.
    rs = json.loads(response.body)
    self.MAX_PAGE = rs['totle']  # sic: the API spells 'total' as 'totle'
    for value in rs['data']:
        item = ProjectSpidersItem()  # one fresh item per record, so earlier yields are not mutated
        item['name'] = value['lecture_title']
        item['department'] = self.name
        item['speaker'] = value['lecture_speaker']
        item['stime'] = Tu.unite(value['lecture_addtime'])
        item['place'] = value['lecture_address']
        item['url'] = response.urljoin('/views/details_lecture.html?id=' + str(value['id']))
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset) + '&limit=4&language=cn',
                             callback=self.parse)
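# Tu.unite is the project's shared time normalizer; its real definition is
# elsewhere in the repo. A purely hypothetical sketch of what it plausibly
# does — coercing the sites' assorted date strings into one format:
import re

class Tu:
    @staticmethod
    def unite(raw):
        # Pull the first YYYY-MM-DD / YYYY年MM月DD日 / YYYY/MM/DD style date.
        m = re.search(r'(\d{4})[-/年.](\d{1,2})[-/月.](\d{1,2})', raw)
        if m:
            return '{}-{:0>2}-{:0>2}'.format(*m.groups())
        return 'None'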
def parse(self, response): for each in response.xpath('//li[@id="xwdt1_li1"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/div[2]/h5/text()").extract()[0].replace('\r\n', '').strip(' ') item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['speaker'] = item['stime'] = item['place'] = 'None' detail_list = each.xpath("./a/div[2]/div") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) > 0: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] detail_list = each.xpath("./a/div[2]/p") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) == 1: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] if len(text) > 1: for d_text in text: if '演讲' in d_text: item['speaker'] = d_text.strip()[4:] if '地点' in d_text: item['place'] = d_text.strip()[3:] detail_list = each.xpath("./a/div[2]/div[1]/p") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) > 0: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] item['stime'] = Tu.unite(self.get_clear_time(item['url'])) yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//ul[@class="info-list"]/li'): item = ProjectSpidersItem() item['name'] = each.xpath("./div/h4/a/text()").extract()[0].strip() item['url'] = response.urljoin( each.xpath("./div/h4/a/@href").extract()[0]) item['department'] = self.name item['speaker'] = each.xpath( "./div/div/ul/li[1]/span/text()").extract()[0].replace( '演讲者:', '') try: item['place'] = each.xpath( "./div/div/ul/li[3]/span/text()").extract()[0].replace( '地点:', '') except IndexError: item['place'] = 'None' item['stime'] = Tu.unite( each.xpath("./div/div/ul/li[2]/span/text()").extract() [0].replace('时间:', '')) yield item
def parse(self, response):
    # The "last" pagination link encodes the total page count in its href.
    try:
        self.MAX_PAGE = int(
            response.xpath('//li[@class="last"]/a/@href').extract()[0].split('/')[-1])
    except IndexError:
        pass
    for each in response.xpath('//div[@class="main"]'):
        item = ProjectSpidersItem()
        item['name'] = each.xpath('./dl/dt/a/text()').extract()[0]
        item['url'] = response.urljoin(each.xpath('./dl/dt/a/@href').extract()[0])
        item['department'] = self.name
        try:
            item['speaker'] = each.xpath("./dl/dd[@class='name']/text()").extract()[0].replace('演讲者:', '')
        except IndexError:
            item['speaker'] = 'None'
        try:
            item['place'] = each.xpath("./dl/dd[@class='place']/text()").extract()[0].replace('地点:', '')
        except IndexError:
            item['place'] = 'None'
        try:
            item['stime'] = Tu.unite(
                each.xpath("./dl/dd[@class='date']/text()").extract()[0].replace('时间:', ''))
        except IndexError:
            item['stime'] = 'None'
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
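# Every parse() above fills the same six fields, so the item class can be
# reconstructed from usage alone. A sketch of ProjectSpidersItem under that
# assumption (the real definition lives in the project's items.py):
import scrapy

class ProjectSpidersItem(scrapy.Item):
    name = scrapy.Field()        # lecture title
    url = scrapy.Field()         # absolute URL of the detail page
    department = scrapy.Field()  # spider name (source department)
    speaker = scrapy.Field()     # 演讲者
    stime = scrapy.Field()       # time, normalized via Tu.unite
    place = scrapy.Field()       # 地点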