예제 #1
0
 def parse(self, response):
     """Turn one listing page into items and schedule the next page.

     For every ``<li>`` under ``ul.info-list`` a ``ProjectSpidersItem`` is
     yielded with ``name``, ``url``, ``department``, ``stime``, ``speaker``
     and ``place``.  The last three default to the string ``'None'`` and
     are overwritten from the labelled detail rows when present.

     After the items, a request for the next page is yielded while
     ``self.offset`` is below ``self.MAX_PAGE``.
     """
     for each in response.xpath('//ul[@class="info-list"]/li'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./a/text()").extract()[0].strip()
         item['url'] = response.urljoin(
             each.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         # Defaults stay in place when the listing omits a detail row.
         item['stime'] = 'None'
         item['speaker'] = 'None'
         item['place'] = 'None'
         for detail in each.xpath('./div/ul/li'):
             # Evaluate each XPath once per row instead of once per field.
             labels = detail.xpath('./strong/text()').extract()
             values = detail.xpath('./span/text()').extract()
             if not labels or not values:
                 # A row lacking label or value text cannot fill a field;
                 # skip it rather than abort the whole page with IndexError.
                 continue
             label, value = labels[0], values[0]
             if '时间' in label:
                 # Tu.unite is defined elsewhere; presumably it normalizes
                 # the time string -- confirm against its definition.
                 item['stime'] = Tu.unite(value)
             if '地点' in label:
                 item['place'] = value
             if '演讲' in label:
                 item['speaker'] = value
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html',
                              callback=self.parse)
예제 #2
0
 def parse(self, response):
     """Build one item per ``div.section`` and request the following page.

     ``name`` and ``url`` come from the link in ``./dl/dt/a`` and must be
     present (a missing one raises ``IndexError``).  ``stime``, ``speaker``
     and ``place`` fall back to the string ``'None'`` when their node has
     no text.  A request for the next page is yielded while
     ``self.offset`` is below ``self.MAX_PAGE``.
     """
     for section in response.xpath('//div[@class="section"]'):
         item = ProjectSpidersItem()

         title = section.xpath("./dl/dt/a/text()").extract()[0]
         item['name'] = title.replace('\r\n', '').strip(' ')
         href = section.xpath("./dl/dt/a/@href").extract()[0]
         item['url'] = response.urljoin(href)
         item['department'] = self.name

         # Each optional field: first text node if any, else 'None'.
         when = section.xpath("./div/text()").extract()
         item['stime'] = Tu.unite(when[0]) if len(when) > 0 else 'None'

         who = section.xpath("./dl/dd[1]/text()").extract()
         item['speaker'] = who[0] if len(who) > 0 else 'None'

         where = section.xpath("./dl/dd[3]/text()").extract()
         item['place'] = where[0] if len(where) > 0 else 'None'

         yield item

     if self.offset < self.MAX_PAGE:
         self.offset += 1
         next_url = self.url + str(self.offset)
         yield scrapy.Request(next_url, callback=self.parse)
예제 #3
0
 def parse(self, response):
     """Yield one item per ``div.newsAcademicListRow``, then the next page.

     The first ``<p>`` supplies ``name`` and ``url``.  The following
     paragraphs (``p[2]``, ``p[3]``, ...) are scanned in order for the
     speaker, place and time labels; scanning ends when a paragraph index
     yields no text, which surfaces as ``IndexError`` and is caught.
     Fields not found keep the string ``'None'``.
     """
     for row in response.xpath('//div[@class="newsAcademicListRow"]'):
         item = ProjectSpidersItem()
         item['name'] = row.xpath("./p[1]/a/text()").extract()[0]
         link = row.xpath("./p[1]/a/@href").extract()[0]
         item['url'] = response.urljoin(link)
         item['department'] = self.name

         item['speaker'] = 'None'
         item['place'] = 'None'
         item['stime'] = 'None'
         position = 2
         try:
             # Walk the paragraphs until indexing an empty result raises.
             while True:
                 query = "./p[{}]/text()".format(position)
                 line = row.xpath(query).extract()[0]
                 if '演讲者' in line:
                     item['speaker'] = line.replace('演讲者:', '')
                 if '地点' in line:
                     item['place'] = line.replace('地点:', '')
                 if '时间' in line:
                     cleaned = line.replace('时间:', '')
                     item['stime'] = Tu.unite(cleaned)
                 position += 1
         except IndexError:
             # Ran past the last paragraph that has text: item is complete.
             pass
         yield item

     if self.offset < self.MAX_PAGE:
         self.offset += 1
         next_url = self.url + str(self.offset)
         yield scrapy.Request(next_url, callback=self.parse)
예제 #4
0
 def parse(self, response):
     """Yield one item per entry div and schedule the next page.

     Entries are the child ``<div>`` elements of
     ``div.ch_content_dynamic`` except the first one, which is skipped.
     ``name`` and ``url`` are required (``IndexError`` if absent);
     ``stime``, ``speaker`` and ``place`` default to the string
     ``'None'`` when their node has no text.  A request for the next page
     is yielded while ``self.offset`` is below ``self.MAX_PAGE``.
     """
     div_list = response.xpath('//div[@class="ch_content_dynamic"]/div')
     # Skip the first child div (presumably a header row -- confirm).
     # Slicing rather than pop(0) means a page with no matching divs
     # yields nothing instead of raising IndexError, so the pagination
     # request below is still issued.
     for each in div_list[1:]:
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./a/div[2]/div[1]/text()").extract()[0]
         item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0])
         item['department'] = self.name

         text_list = each.xpath("./a/div[2]/p/span/text()").extract()
         item['stime'] = Tu.unite(text_list[0]) if text_list else 'None'

         text_list = each.xpath(
             "./a/div[2]/div[2]/div/span[1]/text()").extract()
         item['speaker'] = text_list[0] if text_list else 'None'

         text_list = each.xpath(
             "./a/div[2]/div[2]/div/span[3]/text()").extract()
         item['place'] = text_list[0] if text_list else 'None'

         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html',
                              callback=self.parse)
예제 #5
0
 def parse(self, response):
     """Convert a JSON listing response into items and request the next page.

     The response body is decoded as JSON.  ``rs['totle']`` is stored in
     ``self.MAX_PAGE`` and each record in ``rs['data']`` becomes its own
     ``ProjectSpidersItem``.  A request for the next page is yielded while
     ``self.offset`` is below ``self.MAX_PAGE``.

     Raises:
         KeyError: if the payload or a record lacks an expected key.
     """
     rs = json.loads(response.body)
     # NOTE(review): 'totle' is the key exactly as this code reads it;
     # presumably the API's own spelling -- confirm before "fixing" it.
     self.MAX_PAGE = rs['totle']
     for value in rs['data']:
         # Create a fresh item per record.  Re-yielding one shared instance
         # would make every yielded item the same object, so later records
         # would overwrite earlier ones still held by pipelines/exporters.
         item = ProjectSpidersItem()
         item['name'] = value['lecture_title']
         item['department'] = self.name
         item['speaker'] = value['lecture_speaker']
         item['stime'] = Tu.unite(value['lecture_addtime'])
         item['place'] = value['lecture_address']
         item['url'] = response.urljoin('/views/details_lecture.html?id=' +
                                        str(value['id']))
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) +
                              '&limit=4&language=cn',
                              callback=self.parse)
예제 #6
0
 def parse(self, response):
     """Yield one item per ``li#xwdt1_li1`` and schedule the next page.

     Speaker and place are searched for in three locations under
     ``./a/div[2]`` (its ``div`` children, its ``p`` children, then the
     ``p`` children of its first ``div``); a later match overwrites an
     earlier one.  ``stime`` is always taken from
     ``self.get_clear_time(item['url'])`` passed through ``Tu.unite``.
     """

     def fill(target, raw, strip=False):
         # The label is tested on the raw text; the value is what follows
         # the first 4 (speaker) or 3 (place) characters.
         cleaned = raw.strip() if strip else raw
         if '演讲' in raw:
             target['speaker'] = cleaned[4:]
         if '地点' in raw:
             target['place'] = cleaned[3:]

     for each in response.xpath('//li[@id="xwdt1_li1"]'):
         item = ProjectSpidersItem()
         heading = each.xpath("./a/div[2]/h5/text()").extract()[0]
         item['name'] = heading.replace('\r\n', '').strip(' ')
         item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         item['speaker'] = 'None'
         item['stime'] = 'None'
         item['place'] = 'None'

         # 1) <div> children: only the first text node is considered.
         for node in each.xpath("./a/div[2]/div"):
             texts = node.xpath("./text()").extract()
             if texts:
                 fill(item, texts[0])

         # 2) <p> children: a single text node is used as-is, several
         #    text nodes are each stripped before slicing.
         for node in each.xpath("./a/div[2]/p"):
             texts = node.xpath("./text()").extract()
             if len(texts) == 1:
                 fill(item, texts[0])
             else:
                 # Empty list: loop body never runs, nothing changes.
                 for raw in texts:
                     fill(item, raw, strip=True)

         # 3) <p> children of the first <div>: first text node only.
         for node in each.xpath("./a/div[2]/div[1]/p"):
             texts = node.xpath("./text()").extract()
             if texts:
                 fill(item, texts[0])

         # get_clear_time is defined elsewhere on the spider; presumably it
         # looks the time up from the detail URL -- confirm.
         item['stime'] = Tu.unite(self.get_clear_time(item['url']))
         yield item

     if self.offset < self.MAX_PAGE:
         self.offset += 1
         next_url = self.url + str(self.offset)
         yield scrapy.Request(next_url, callback=self.parse)
예제 #7
0
 def parse(self, response):
     """Yield one item per ``<li>`` under ``ul.info-list``.

     ``name``, ``url``, ``speaker`` and ``stime`` are required: a missing
     node raises ``IndexError``.  ``place`` is optional and becomes the
     string ``'None'`` when the third detail row has no span text.  No
     follow-up request is issued by this callback.
     """
     for entry in response.xpath('//ul[@class="info-list"]/li'):
         item = ProjectSpidersItem()

         title = entry.xpath("./div/h4/a/text()").extract()[0]
         item['name'] = title.strip()
         href = entry.xpath("./div/h4/a/@href").extract()[0]
         item['url'] = response.urljoin(href)
         item['department'] = self.name

         speaker = entry.xpath("./div/div/ul/li[1]/span/text()").extract()[0]
         item['speaker'] = speaker.replace('演讲者:', '')

         # Only the place row is allowed to be absent.
         place = entry.xpath("./div/div/ul/li[3]/span/text()").extract()
         if place:
             item['place'] = place[0].replace('地点:', '')
         else:
             item['place'] = 'None'

         when = entry.xpath("./div/div/ul/li[2]/span/text()").extract()[0]
         item['stime'] = Tu.unite(when.replace('时间:', ''))

         yield item
예제 #8
0
 def parse(self, response):
     """Yield one item per ``div.main`` and schedule the next page.

     ``self.MAX_PAGE`` is refreshed from the last path segment of the
     ``li.last`` link when that link exists; otherwise it is left as is.
     ``name`` and ``url`` are required; ``speaker``, ``place`` and
     ``stime`` fall back to the string ``'None'``.  A request for the next
     page is yielded while ``self.offset`` is below ``self.MAX_PAGE``.
     """
     last_links = response.xpath('//li[@class="last"]/a/@href').extract()
     try:
         self.MAX_PAGE = int(last_links[0].split('/')[-1])
     except IndexError:
         # No "last" link on this page: keep the current MAX_PAGE.
         pass

     # (item field, XPath of its text, label prefix removed from the text)
     labelled = (
         ('speaker', "./dl/dd[@class='name']/text()", '演讲者:'),
         ('place', "./dl/dd[@class='place']/text()", '地点:'),
     )

     for block in response.xpath('//div[@class="main"]'):
         item = ProjectSpidersItem()
         item['name'] = block.xpath("./dl/dt/a/text()").extract()[0]
         href = block.xpath("./dl/dt/a/@href").extract()[0]
         item['url'] = response.urljoin(href)
         item['department'] = self.name

         for field, query, prefix in labelled:
             found = block.xpath(query).extract()
             item[field] = found[0].replace(prefix, '') if found else 'None'

         # Tu.unite stays inside the try, so an IndexError raised while
         # handling the time text also results in 'None'.
         try:
             raw_time = block.xpath(
                 "./dl/dd[@class='date']/text()").extract()[0]
             item['stime'] = Tu.unite(raw_time.replace('时间:', ''))
         except IndexError:
             item['stime'] = 'None'

         yield item

     if self.offset < self.MAX_PAGE:
         self.offset += 1
         next_url = self.url + str(self.offset)
         yield scrapy.Request(next_url, callback=self.parse)