Пример #1
0
 def parse(self, response):
     """Parse a lecture-list page: yield one item per entry, then paginate.

     Dates on this site omit the year, so ``self.pas_month`` /
     ``self.re_year`` are tracked across calls: when a month larger than
     the previously seen one appears, ``re_year`` is decremented
     (presumably because the listing runs newest-first — TODO confirm).
     """
     for each in response.xpath('//div[@class="div_conter"]'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./p/a/@title").extract()[0].strip()
         item['url'] = response.urljoin(
             each.xpath("./p/a/@href").extract()[0])
         item['department'] = self.name
         detail = each.xpath("./div/div/p/text()").extract()
         # Layout A: all fields packed into one text node, newline-separated.
         if len(detail) == 1:
             # NOTE(review): the two space-removal replaces look identical
             # here; one was presumably a full-width space (\u3000) before
             # the file was re-encoded — verify against the live page.
             detail = detail[0].strip().replace(' ', '').replace(
                 ' ', '').replace('\xa0', '')
             detail = detail.split('\n')
             for it in detail:
                 it = it.strip()
                 # '时间' (time) line: [3:] skips the label + colon;
                 # the month is the text up to '月'.
                 if it.find('时间') != -1:
                     if self.pas_month < int(it[3:it.find('月')]):
                         self.re_year -= 1
                     item['stime'] = Tu.bio(it[3:].strip(), self.re_year)
                     self.pas_month = int(item['stime'].split('-')[1])
                 # '地点' (place) / '报告' (report/speaker) lines.
                 if it.find('地点') != -1:
                     item['place'] = it[3:].strip()
                 if it.find('报告') != -1:
                     item['speaker'] = it[4:].strip()
         # Layout B: each field is its own text node; defaults are 'None'.
         if len(detail) > 1:
             item['speaker'] = 'None'
             item['place'] = 'None'
             item['stime'] = 'None'
             for it in detail:
                 it = it.strip().replace(' ', '').replace(' ', '').replace(
                     '\xa0', '')
                 # Skip lines that contain a year marker ('年').
                 if it.find('年') != -1:
                     continue
                 if it.find('时间') != -1:
                     if it.find('月') != -1:
                         if self.pas_month < int(it[3:it.find('月')]):
                             self.re_year -= 1
                         item['stime'] = Tu.bio(it[3:].strip(),
                                                self.re_year)
                         self.pas_month = int(item['stime'].split('-')[1])
                 if it.find('地点') != -1:
                     item['place'] = it[3:].strip()
                 if it.find('报告') != -1:
                     item['speaker'] = it[4:].strip()
                 # '演讲' (lecture/speaker) is an alternate speaker label.
                 if it.find('演讲') != -1:
                     item['speaker'] = it[4:].strip()
         yield item
     # Request the next listing page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset),
                              callback=self.parse)
Пример #2
0
 def parse(self, response):
     """Parse an info-list page into lecture items, then paginate.

     Each ``<li>`` holds the title link plus a nested list of labelled
     detail rows (``<strong>`` label, ``<span>`` value). Fields default
     to the literal string 'None' when their label is absent.
     """
     for each in response.xpath('//ul[@class="info-list"]/li'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./a/text()").extract()[0].strip()
         item['url'] = response.urljoin(
             each.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         item['stime'] = 'None'
         item['speaker'] = 'None'
         item['place'] = 'None'
         for detail in each.xpath('./div/ul/li'):
             # Hoisted: extract the label once per row instead of
             # re-running the identical xpath for every field check.
             label = detail.xpath('./strong/text()').extract()[0]
             if label.find('时间') != -1:
                 item['stime'] = Tu.unite(
                     detail.xpath('./span/text()').extract()[0])
             if label.find('地点') != -1:
                 item['place'] = detail.xpath('./span/text()').extract()[0]
             if label.find('演讲') != -1:
                 item['speaker'] = detail.xpath(
                     './span/text()').extract()[0]
         yield item
     # Follow the next page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html',
                              callback=self.parse)
Пример #3
0
 def parse(self, response):
     """Yield one lecture item per news row, then request the next page.

     Detail lines live in successive ``<p>`` siblings starting at index
     2; scanning stops when the xpath runs out of paragraphs and the
     ``[0]`` lookup raises IndexError.
     """
     for row in response.xpath('//div[@class="newsAcademicListRow"]'):
         item = ProjectSpidersItem()
         item['name'] = row.xpath("./p[1]/a/text()").extract()[0]
         item['url'] = response.urljoin(
             row.xpath("./p[1]/a/@href").extract()[0])
         item['department'] = self.name
         item['speaker'] = item['place'] = item['stime'] = 'None'
         p_index = 2
         try:
             while True:
                 text = row.xpath(
                     "./p[" + str(p_index) + "]/text()").extract()[0]
                 if text.find('演讲者') > -1:
                     item['speaker'] = text.replace('演讲者:', '')
                 if text.find('地点') > -1:
                     item['place'] = text.replace('地点:', '')
                 if text.find('时间') > -1:
                     item['stime'] = Tu.unite(text.replace('时间:', ''))
                 p_index += 1
         except IndexError:
             pass
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset),
                              callback=self.parse)
Пример #4
0
 def parse(self, response):
     """Build one lecture item per "section" div, then paginate.

     Each field falls back to the literal string 'None' when its node
     is missing from the section.
     """
     for section in response.xpath('//div[@class="section"]'):
         item = ProjectSpidersItem()
         item['name'] = section.xpath("./dl/dt/a/text()").extract()[0].replace('\r\n', '').strip(' ')
         item['url'] = response.urljoin(section.xpath("./dl/dt/a/@href").extract()[0])
         item['department'] = self.name
         time_texts = section.xpath("./div/text()").extract()
         item['stime'] = Tu.unite(time_texts[0]) if time_texts else 'None'
         speaker_texts = section.xpath("./dl/dd[1]/text()").extract()
         item['speaker'] = speaker_texts[0] if speaker_texts else 'None'
         place_texts = section.xpath("./dl/dd[3]/text()").extract()
         item['place'] = place_texts[0] if place_texts else 'None'
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Пример #5
0
 def parse(self, response):
     """Parse lecture items from a fixed absolute-path listing page.

     The detail paragraph mixes speaker, time and place in one string;
     fields are carved out with ``find()`` offsets on the Chinese
     labels ('演讲者' = speaker, '时间' = time, '地点' = place).
     """
     for each in response.xpath('/html/body/div[3]/div[2]/div[2]/div/ul/li'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./div[2]/h3/a/text()").extract()[0]
         item['url'] = response.urljoin(each.xpath("./div[2]/h3/a/@href").extract()[0])
         item['department'] = self.name
         detail = each.xpath("./div[2]/p/text()").extract()[0]
         # Speaker: everything after the label (+4 skips label + colon).
         if detail.find("演讲者") > -1:
             item['speaker'] = detail[detail.find("演讲者") + 4:]
         else:
             item['speaker'] = 'None'
         if detail.find("时间") > -1:
             stime = None
             # The date ends at '日' or '号'; only the first 15 chars are
             # searched so a later occurrence is not matched.
             if detail.find("日", 0, 15) > -1:
                 stime = detail[detail.find("时间") + 3:detail.find("日")+1]
             if detail.find("号", 0, 15) > -1:
                 stime = detail[detail.find("时间") + 3:detail.find("号")+1]
             # NOTE(review): if neither marker appears early, stime stays
             # None and Tu.ess receives None — confirm Tu.ess handles that.
             item['stime'] = Tu.ess(stime)
         else:
             item['stime'] = 'None'
         # Place: between the '地点' label and 3 chars before '演讲者'.
         # NOTE(review): if '演讲者' is absent find() returns -1, making
         # the slice end -4 — verify this case never occurs on the site.
         if detail.find("地点") > -1:
             item['place'] = detail[detail.find("地点") + 3:detail.find("演讲者") - 3]
         else:
             item['place'] = 'None'
         yield item
     # Request the next listing page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html', callback=self.parse)
Пример #6
0
 def parse(self, response):
     """Parse the dynamic-content listing, skipping its header div.

     Each field falls back to the literal string 'None' when its node
     is missing from the entry.
     """
     # The first child div is a header row, not a lecture entry.
     for entry in response.xpath('//div[@class="ch_content_dynamic"]/div')[1:]:
         item = ProjectSpidersItem()
         item['name'] = entry.xpath("./a/div[2]/div[1]/text()").extract()[0]
         item['url'] = response.urljoin(entry.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         time_texts = entry.xpath("./a/div[2]/p/span/text()").extract()
         item['stime'] = Tu.unite(time_texts[0]) if time_texts else 'None'
         speaker_texts = entry.xpath("./a/div[2]/div[2]/div/span[1]/text()").extract()
         item['speaker'] = speaker_texts[0] if speaker_texts else 'None'
         place_texts = entry.xpath("./a/div[2]/div[2]/div/span[3]/text()").extract()
         item['place'] = place_texts[0] if place_texts else 'None'
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html', callback=self.parse)
Пример #7
0
 def parse(self, response):
     """Parse lecture rows, combining the list's publish date with the
     row's time text (via ``Tu.physic``) to build a full timestamp.

     Detail spans are scanned by index until IndexError ends the walk.
     """
     for each in response.xpath('/html/body/div[5]/div/div/ul/li'):
         item = ProjectSpidersItem()
         item['department'] = self.name
         item['name'] = each.xpath('./div/h4/a/text()').extract()[0]
         item['url'] = response.urljoin(
             each.xpath('./div/h4/a/@href').extract()[0])
         # Publish date badge: first node is the year, second the
         # 'MM/DD' part (normalized to 'MM-DD').
         publish_year = each.xpath('./div/span/text()').extract()[0]
         publish_day = each.xpath('./div/span/text()').extract()[1].replace(
             '/', '-')
         try:
             index = 1
             item['speaker'] = item['place'] = item['stime'] = 'None'
             while True:
                 text = each.xpath('./div/div/span[' + str(index) +
                                   ']/text()').extract()[0]
                 if text.find('演讲者') > -1:
                     item['speaker'] = text.replace('演讲者:', '')
                 if text.find('地点') > -1:
                     item['place'] = text.replace('地点:', '')
                 if text.find('时间') > -1:
                     # Fixed: removed the redundant assignment of the raw
                     # text to item['stime'] that was immediately
                     # overwritten by the Tu.physic result below.
                     old_time = text.replace('时间:', '')
                     item['stime'] = Tu.physic(
                         old_time, publish_year + '-' + publish_day)
                 index += 1
         except IndexError:
             pass
         yield item
     # Request the next listing page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) + '.html',
                              callback=self.parse)
Пример #8
0
 def parse(self, response):
     """Emit one item per lecture <li>; Chinese labels are stripped
     from each value before storing it.
     """
     for entry in response.xpath('//ul[@class="lec-ul"]/li'):
         item = ProjectSpidersItem()
         title_text = entry.xpath("./a/div/p[1]/text()").extract()[0]
         item['name'] = title_text.strip()
         item['url'] = response.urljoin(entry.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         speaker_text = entry.xpath("./a/div/p[2]/span[1]/text()").extract()[0]
         item['speaker'] = speaker_text.replace('演讲者:', '')
         place_text = entry.xpath("./a/div/p[2]/span[3]/text()").extract()[0]
         item['place'] = place_text.replace('地点:', '')
         time_text = entry.xpath("./a/div/p[2]/span[2]/text()").extract()[0]
         item['stime'] = Tu.ese(time_text.replace('时间:', ''))
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Пример #9
0
 def parse(self, response):
     """Parse a JSON lecture feed and paginate up to the reported total.

     The body is JSON with a 'totle' (sic — server-side key name) page
     count and a 'data' array of lecture records.
     """
     rs = json.loads(response.body)
     self.MAX_PAGE = rs['totle']
     for value in rs['data']:
         # Fixed: create a fresh item per record. The original reused a
         # single item instance across iterations, so every yielded item
         # shared — and was later clobbered through — the same object.
         item = ProjectSpidersItem()
         item['name'] = value['lecture_title']
         item['department'] = self.name
         item['speaker'] = value['lecture_speaker']
         item['stime'] = Tu.unite(value['lecture_addtime'])
         item['place'] = value['lecture_address']
         item['url'] = response.urljoin('/views/details_lecture.html?id=' +
                                        str(value['id']))
         yield item
     # Request the next feed page until the server-reported total.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset) +
                              '&limit=4&language=cn',
                              callback=self.parse)
Пример #10
0
 def parse(self, response):
     """Parse finance-school rows; rows without a usable speaker are
     skipped entirely (no item is yielded). This spider does not
     paginate — the URL field is a fixed listing address.
     """
     for row in response.xpath('//div[@class="nnews3_lb3_m"]'):
         item = ProjectSpidersItem()
         item['name'] = row.xpath('./div[1]/text()').extract()[0].strip()
         item['url'] = 'http://fin.sustech.edu.cn/news3.aspx?TypeId=126&FId=t2:126:2'
         item['department'] = self.name
         try:
             speaker_text = row.xpath('./div[2]/div[1]/text()').extract()[1].strip()
         except IndexError:
             continue
         # Placeholder rows use '\' or an empty speaker field — skip them.
         if speaker_text in ('\\', ''):
             continue
         item['speaker'] = speaker_text
         try:
             item['place'] = row.xpath('./div[2]/div[3]/span[2]/text()').extract()[0].strip()
         except IndexError:
             item['place'] = 'None'
         item['stime'] = Tu.fin(row.xpath('./div[2]/div[2]/text()').extract()[1].strip())
         yield item
Пример #11
0
 def parse(self, response):
     """Parse news items whose detail markup varies between templates.

     Speaker/place may live under ``./a/div[2]/div``, ``./a/div[2]/p``,
     or ``./a/div[2]/div[1]/p``; all three locations are scanned in
     order and later matches overwrite earlier ones. The event time is
     fetched separately via ``self.get_clear_time``.
     """
     for each in response.xpath('//li[@id="xwdt1_li1"]'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./a/div[2]/h5/text()").extract()[0].replace('\r\n', '').strip(' ')
         item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         item['speaker'] = item['stime'] = item['place'] = 'None'
         # Variant 1: details as direct <div> children.
         detail_list = each.xpath("./a/div[2]/div")
         for detail in detail_list:
             text = detail.xpath("./text()").extract()
             if len(text) > 0:
                 # [4:] / [3:] skip the label plus its colon.
                 if '演讲' in text[0]:
                     item['speaker'] = text[0][4:]
                 if '地点' in text[0]:
                     item['place'] = text[0][3:]
         # Variant 2: details as <p> children; a <p> may carry one or
         # several text nodes.
         detail_list = each.xpath("./a/div[2]/p")
         for detail in detail_list:
             text = detail.xpath("./text()").extract()
             if len(text) == 1:
                 if '演讲' in text[0]:
                     item['speaker'] = text[0][4:]
                 if '地点' in text[0]:
                     item['place'] = text[0][3:]
             if len(text) > 1:
                 for d_text in text:
                     if '演讲' in d_text:
                         item['speaker'] = d_text.strip()[4:]
                     if '地点' in d_text:
                         item['place'] = d_text.strip()[3:]
         # Variant 3: details as <p> nested under the first <div>.
         detail_list = each.xpath("./a/div[2]/div[1]/p")
         for detail in detail_list:
             text = detail.xpath("./text()").extract()
             if len(text) > 0:
                 if '演讲' in text[0]:
                     item['speaker'] = text[0][4:]
                 if '地点' in text[0]:
                     item['place'] = text[0][3:]
         # Time comes from the item's detail page.
         item['stime'] = Tu.unite(self.get_clear_time(item['url']))
         yield item
     # Request the next listing page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Пример #12
0
 def parse(self, response):
     """Parse info-list lecture entries; the place field falls back to
     the literal string 'None' when its node is absent. No pagination
     in this spider.
     """
     for entry in response.xpath('//ul[@class="info-list"]/li'):
         item = ProjectSpidersItem()
         title_text = entry.xpath("./div/h4/a/text()").extract()[0]
         item['name'] = title_text.strip()
         item['url'] = response.urljoin(
             entry.xpath("./div/h4/a/@href").extract()[0])
         item['department'] = self.name
         speaker_text = entry.xpath(
             "./div/div/ul/li[1]/span/text()").extract()[0]
         item['speaker'] = speaker_text.replace('演讲者:', '')
         try:
             place_text = entry.xpath(
                 "./div/div/ul/li[3]/span/text()").extract()[0]
             item['place'] = place_text.replace('地点:', '')
         except IndexError:
             item['place'] = 'None'
         time_text = entry.xpath(
             "./div/div/ul/li[2]/span/text()").extract()[0]
         item['stime'] = Tu.unite(time_text.replace('时间:', ''))
         yield item
Пример #13
0
 def parse(self, response):
     """Parse English-language "data-item" rows, stripping the
     'Lecture:'/'Speaker:'/'Location:'/'Time:' prefixes. Pagination
     continues while the current page yields any rows at all.
     """
     rows = response.xpath('//li[@class="data-item "]')
     for row in rows:
         item = ProjectSpidersItem()
         name_text = row.xpath(
             "./a/span[2]/span[2]/span[1]/span/text()").extract()[0]
         item['name'] = name_text.strip().replace('Lecture:', '')
         item['url'] = response.urljoin(
             row.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         speaker_text = row.xpath(
             "./a/span[2]/span[2]/span[2]/span[1]/text()").extract()[0]
         item['speaker'] = speaker_text.replace('Speaker:', '')
         try:
             place_text = row.xpath(
                 "./a/span[2]/span[2]/span[2]/span[3]/text()").extract()[0]
             item['place'] = place_text.replace('Location:', '')
         except IndexError:
             item['place'] = 'None'
         time_text = row.xpath(
             "./a/span[2]/span[2]/span[2]/span[2]/text()").extract()[0]
         item['stime'] = Tu.med(time_text.replace('Time:', ''))
         yield item
     # No MAX_PAGE here: keep paging while the page still has rows.
     if len(rows) > 0:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset),
                              callback=self.parse)
Пример #14
0
 def parse(self, response):
     """Parse notice cards; the event year comes from a separate date
     badge node and is combined with the card's month/day by ``Tu.bme``.

     When no explicit speaker node exists, fall back to the text before
     the colon in the title (if any).
     """
     for each in response.xpath('//div[@class="ny_tzgg clearfix"]/div'):
         item = ProjectSpidersItem()
         item['name'] = each.xpath("./a/div[2]/h3/text()").extract()[0].strip()
         item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0])
         item['department'] = self.name
         # Second text node of the date badge holds the year.
         year = each.xpath("./a/div[1]/div[2]/text()").extract()[1]
         item['stime'] = Tu.bme(each.xpath('./a/div[2]/p[1]/text()').extract()[0], year)
         item['place'] = each.xpath('./a/div[2]/p[2]/text()').extract()[0]
         try:
             item['speaker'] = each.xpath('./a/div[2]/p[3]/span/text()').extract()[0]
         except IndexError:
             # NOTE(review): the outer and inner colon checks look
             # identical here; presumably one targets the full-width
             # colon and the other the ASCII colon, lost in re-encoding
             # — verify against the original file before touching this.
             if item['name'].find(':') != -1:
                 item['speaker'] = item['name'].split(':')[0]
             else:
                 if item['name'].find(':') != -1:
                     item['speaker'] = item['name'].split(':')[0]
                 else:
                     item['speaker'] = 'None'
         yield item
     # Request the next listing page until the configured page limit.
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
Пример #15
0
 def parse(self, response):
     """Parse "main" listing blocks into lecture items.

     MAX_PAGE is refreshed from the pager's "last" link when one is
     present; each field falls back to the literal string 'None' when
     its node is missing.
     """
     try:
         last_href = response.xpath(
             '//li[@class="last"]/a/@href').extract()[0]
         self.MAX_PAGE = int(last_href.split('/')[-1])
     except IndexError:
         # No pager on this page — keep the current MAX_PAGE.
         pass
     for block in response.xpath('//div[@class="main"]'):
         item = ProjectSpidersItem()
         item['name'] = block.xpath("./dl/dt/a/text()").extract()[0]
         item['url'] = response.urljoin(
             block.xpath("./dl/dt/a/@href").extract()[0])
         item['department'] = self.name
         try:
             speaker_text = block.xpath(
                 "./dl/dd[@class='name']/text()").extract()[0]
             item['speaker'] = speaker_text.replace('演讲者:', '')
         except IndexError:
             item['speaker'] = 'None'
         try:
             place_text = block.xpath(
                 "./dl/dd[@class='place']/text()").extract()[0]
             item['place'] = place_text.replace('地点:', '')
         except IndexError:
             item['place'] = 'None'
         try:
             date_text = block.xpath(
                 "./dl/dd[@class='date']/text()").extract()[0]
             item['stime'] = Tu.unite(date_text.replace('时间:', ''))
         except IndexError:
             item['stime'] = 'None'
         yield item
     if self.offset < self.MAX_PAGE:
         self.offset += 1
         yield scrapy.Request(self.url + str(self.offset),
                              callback=self.parse)