def parse(self, response): for each in response.xpath('//div[@class="div_conter"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./p/a/@title").extract()[0].strip() item['url'] = response.urljoin( each.xpath("./p/a/@href").extract()[0]) item['department'] = self.name detail = each.xpath("./div/div/p/text()").extract() if len(detail) == 1: detail = detail[0].strip().replace(' ', '').replace( ' ', '').replace('\xa0', '') detail = detail.split('\n') for it in detail: it = it.strip() if it.find('时间') != -1: if self.pas_month < int(it[3:it.find('月')]): self.re_year -= 1 item['stime'] = Tu.bio(it[3:].strip(), self.re_year) self.pas_month = int(item['stime'].split('-')[1]) if it.find('地点') != -1: item['place'] = it[3:].strip() if it.find('报告') != -1: item['speaker'] = it[4:].strip() if len(detail) > 1: item['speaker'] = 'None' item['place'] = 'None' item['stime'] = 'None' for it in detail: it = it.strip().replace(' ', '').replace(' ', '').replace( '\xa0', '') if it.find('年') != -1: continue if it.find('时间') != -1: if it.find('月') != -1: if self.pas_month < int(it[3:it.find('月')]): self.re_year -= 1 item['stime'] = Tu.bio(it[3:].strip(), self.re_year) self.pas_month = int(item['stime'].split('-')[1]) if it.find('地点') != -1: item['place'] = it[3:].strip() if it.find('报告') != -1: item['speaker'] = it[4:].strip() if it.find('演讲') != -1: item['speaker'] = it[4:].strip() yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//ul[@class="info-list"]/li'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/text()").extract()[0].strip() item['url'] = response.urljoin( each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['stime'] = 'None' item['speaker'] = 'None' item['place'] = 'None' for detail in each.xpath('./div/ul/li'): if detail.xpath('./strong/text()').extract()[0].find( '时间') != -1: item['stime'] = Tu.unite( detail.xpath('./span/text()').extract()[0]) if detail.xpath('./strong/text()').extract()[0].find( '地点') != -1: item['place'] = detail.xpath('./span/text()').extract()[0] if detail.xpath('./strong/text()').extract()[0].find( '演讲') != -1: item['speaker'] = detail.xpath( './span/text()').extract()[0] yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset) + '.html', callback=self.parse)
def parse(self, response): for each in response.xpath('//div[@class="newsAcademicListRow"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./p[1]/a/text()").extract()[0] item['url'] = response.urljoin( each.xpath("./p[1]/a/@href").extract()[0]) item['department'] = self.name try: index = 2 item['speaker'] = item['place'] = item['stime'] = 'None' while True: text = each.xpath("./p[" + str(index) + "]/text()").extract()[0] if text.find('演讲者') > -1: item['speaker'] = text.replace('演讲者:', '') if text.find('地点') > -1: item['place'] = text.replace('地点:', '') if text.find('时间') > -1: item['stime'] = Tu.unite(text.replace('时间:', '')) index += 1 except IndexError: pass yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//div[@class="section"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./dl/dt/a/text()").extract()[0].replace('\r\n', '').strip(' ') item['url'] = response.urljoin(each.xpath("./dl/dt/a/@href").extract()[0]) item['department'] = self.name text_list = each.xpath("./div/text()").extract() if len(text_list) > 0: item['stime'] = Tu.unite(text_list[0]) else: item['stime'] = 'None' text_list = each.xpath("./dl/dd[1]/text()").extract() if len(text_list) > 0: item['speaker'] = text_list[0] else: item['speaker'] = 'None' text_list = each.xpath("./dl/dd[3]/text()").extract() if len(text_list) > 0: item['place'] = text_list[0] else: item['place'] = 'None' yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    for each in response.xpath('/html/body/div[3]/div[2]/div[2]/div/ul/li'):
        item = ProjectSpidersItem()
        item['name'] = each.xpath("./div[2]/h3/a/text()").extract()[0]
        item['url'] = response.urljoin(
            each.xpath("./div[2]/h3/a/@href").extract()[0])
        item['department'] = self.name
        detail = each.xpath("./div[2]/p/text()").extract()[0]
        if detail.find("演讲者") > -1:
            item['speaker'] = detail[detail.find("演讲者") + 4:]
        else:
            item['speaker'] = 'None'
        if detail.find("时间") > -1:
            stime = None
            # The date ends at either '日' or '号'; search only the first
            # 15 characters so a later occurrence does not match.
            if detail.find("日", 0, 15) > -1:
                stime = detail[detail.find("时间") + 3:detail.find("日") + 1]
            if detail.find("号", 0, 15) > -1:
                stime = detail[detail.find("时间") + 3:detail.find("号") + 1]
            item['stime'] = Tu.ess(stime)
        else:
            item['stime'] = 'None'
        if detail.find("地点") > -1:
            item['place'] = detail[detail.find("地点") + 3:
                                   detail.find("演讲者") - 3]
        else:
            item['place'] = 'None'
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset) + '.html',
                             callback=self.parse)
def parse(self, response): div_list = response.xpath('//div[@class="ch_content_dynamic"]/div') div_list.pop(0) for each in div_list: item = ProjectSpidersItem() item['name'] = each.xpath("./a/div[2]/div[1]/text()").extract()[0] item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0]) item['department'] = self.name text_list = each.xpath("./a/div[2]/p/span/text()").extract() if len(text_list) > 0: item['stime'] = Tu.unite(text_list[0]) else: item['stime'] = 'None' text_list = each.xpath("./a/div[2]/div[2]/div/span[1]/text()").extract() if len(text_list) > 0: item['speaker'] = text_list[0] else: item['speaker'] = 'None' text_list = each.xpath("./a/div[2]/div[2]/div/span[3]/text()").extract() if len(text_list) > 0: item['place'] = text_list[0] else: item['place'] = 'None' yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset) + '.html', callback=self.parse)
def parse(self, response):
    for each in response.xpath('/html/body/div[5]/div/div/ul/li'):
        item = ProjectSpidersItem()
        item['department'] = self.name
        item['name'] = each.xpath('./div/h4/a/text()').extract()[0]
        item['url'] = response.urljoin(
            each.xpath('./div/h4/a/@href').extract()[0])
        publish_year = each.xpath('./div/span/text()').extract()[0]
        publish_day = each.xpath('./div/span/text()').extract()[1].replace(
            '/', '-')
        try:
            index = 1
            item['speaker'] = item['place'] = item['stime'] = 'None'
            # Walk the <span> siblings until running off the end (IndexError).
            while True:
                text = each.xpath('./div/div/span[' + str(index) +
                                  ']/text()').extract()[0]
                if text.find('演讲者') > -1:
                    item['speaker'] = text.replace('演讲者:', '')
                if text.find('地点') > -1:
                    item['place'] = text.replace('地点:', '')
                if text.find('时间') > -1:
                    old_time = text.replace('时间:', '')
                    item['stime'] = Tu.physic(
                        old_time, publish_year + '-' + publish_day)
                index += 1
        except IndexError:
            pass
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset) + '.html',
                             callback=self.parse)
def parse(self, response): for each in response.xpath('//ul[@class="lec-ul"]/li'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/div/p[1]/text()").extract()[0].strip() item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['speaker'] = each.xpath("./a/div/p[2]/span[1]/text()").extract()[0].replace('演讲者:', '') item['place'] = each.xpath("./a/div/p[2]/span[3]/text()").extract()[0].replace('地点:', '') item['stime'] = Tu.ese(each.xpath("./a/div/p[2]/span[2]/text()").extract()[0].replace('时间:', '')) yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    rs = json.loads(response.body)
    self.MAX_PAGE = rs['totle']  # 'totle' is the API's own spelling
    for value in rs['data']:
        item = ProjectSpidersItem()  # fresh item per record, not shared
        item['name'] = value['lecture_title']
        item['department'] = self.name
        item['speaker'] = value['lecture_speaker']
        item['stime'] = Tu.unite(value['lecture_addtime'])
        item['place'] = value['lecture_address']
        item['url'] = response.urljoin(
            '/views/details_lecture.html?id=' + str(value['id']))
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset) +
                             '&limit=4&language=cn', callback=self.parse)
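# Assumed shape of the JSON payload consumed above, reconstructed from the
# keys the code reads; the sample values are placeholders for illustration:
#
#   {
#       "totle": 12,            # drives MAX_PAGE / pagination
#       "data": [
#           {
#               "id": 1,
#               "lecture_title": "...",
#               "lecture_speaker": "...",
#               "lecture_addtime": "...",
#               "lecture_address": "..."
#           }
#       ]
#   }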
def parse(self, response): for each in response.xpath('//div[@class="nnews3_lb3_m"]'): item = ProjectSpidersItem() item['name'] = each.xpath('./div[1]/text()').extract()[0].strip() item['url'] = 'http://fin.sustech.edu.cn/news3.aspx?TypeId=126&FId=t2:126:2' item['department'] = self.name try: item['speaker'] = each.xpath('./div[2]/div[1]/text()').extract()[1].strip() if item['speaker'] == '\\' or item['speaker'] == '': continue except IndexError: continue try: item['place'] = each.xpath('./div[2]/div[3]/span[2]/text()').extract()[0].strip() except IndexError: item['place'] = 'None' item['stime'] = Tu.fin(each.xpath('./div[2]/div[2]/text()').extract()[1].strip()) yield item
def parse(self, response): for each in response.xpath('//li[@id="xwdt1_li1"]'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/div[2]/h5/text()").extract()[0].replace('\r\n', '').strip(' ') item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['speaker'] = item['stime'] = item['place'] = 'None' detail_list = each.xpath("./a/div[2]/div") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) > 0: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] detail_list = each.xpath("./a/div[2]/p") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) == 1: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] if len(text) > 1: for d_text in text: if '演讲' in d_text: item['speaker'] = d_text.strip()[4:] if '地点' in d_text: item['place'] = d_text.strip()[3:] detail_list = each.xpath("./a/div[2]/div[1]/p") for detail in detail_list: text = detail.xpath("./text()").extract() if len(text) > 0: if '演讲' in text[0]: item['speaker'] = text[0][4:] if '地点' in text[0]: item['place'] = text[0][3:] item['stime'] = Tu.unite(self.get_clear_time(item['url'])) yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//ul[@class="info-list"]/li'): item = ProjectSpidersItem() item['name'] = each.xpath("./div/h4/a/text()").extract()[0].strip() item['url'] = response.urljoin( each.xpath("./div/h4/a/@href").extract()[0]) item['department'] = self.name item['speaker'] = each.xpath( "./div/div/ul/li[1]/span/text()").extract()[0].replace( '演讲者:', '') try: item['place'] = each.xpath( "./div/div/ul/li[3]/span/text()").extract()[0].replace( '地点:', '') except IndexError: item['place'] = 'None' item['stime'] = Tu.unite( each.xpath("./div/div/ul/li[2]/span/text()").extract() [0].replace('时间:', '')) yield item
def parse(self, response): for each in response.xpath('//li[@class="data-item "]'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/span[2]/span[2]/span[1]/span/text()") \ .extract()[0].strip().replace('Lecture:', '') item['url'] = response.urljoin( each.xpath("./a/@href").extract()[0]) item['department'] = self.name item['speaker'] = each.xpath("./a/span[2]/span[2]/span[2]/span[1]/text()") \ .extract()[0].replace('Speaker:', '') try: item['place'] = each.xpath("./a/span[2]/span[2]/span[2]/span[3]/text()") \ .extract()[0].replace('Location:', '') except IndexError: item['place'] = 'None' item['stime'] = Tu.med( each.xpath("./a/span[2]/span[2]/span[2]/span[2]/text()"). extract()[0].replace('Time:', '')) yield item if len(response.xpath('//li[@class="data-item "]')) > 0: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response): for each in response.xpath('//div[@class="ny_tzgg clearfix"]/div'): item = ProjectSpidersItem() item['name'] = each.xpath("./a/div[2]/h3/text()").extract()[0].strip() item['url'] = response.urljoin(each.xpath("./a/@href").extract()[0]) item['department'] = self.name year = each.xpath("./a/div[1]/div[2]/text()").extract()[1] item['stime'] = Tu.bme(each.xpath('./a/div[2]/p[1]/text()').extract()[0], year) item['place'] = each.xpath('./a/div[2]/p[2]/text()').extract()[0] try: item['speaker'] = each.xpath('./a/div[2]/p[3]/span/text()').extract()[0] except IndexError: if item['name'].find(':') != -1: item['speaker'] = item['name'].split(':')[0] else: if item['name'].find(':') != -1: item['speaker'] = item['name'].split(':')[0] else: item['speaker'] = 'None' yield item if self.offset < self.MAX_PAGE: self.offset += 1 yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
def parse(self, response):
    try:
        # The "last page" link encodes the total page count in its href.
        self.MAX_PAGE = int(response.xpath(
            '//li[@class="last"]/a/@href').extract()[0].split('/')[-1])
    except IndexError:
        pass
    for each in response.xpath('//div[@class="main"]'):
        item = ProjectSpidersItem()
        item['name'] = each.xpath("./dl/dt/a/text()").extract()[0]
        item['url'] = response.urljoin(
            each.xpath("./dl/dt/a/@href").extract()[0])
        item['department'] = self.name
        try:
            item['speaker'] = each.xpath(
                "./dl/dd[@class='name']/text()").extract()[0].replace(
                    '演讲者:', '')
        except IndexError:
            item['speaker'] = 'None'
        try:
            item['place'] = each.xpath(
                "./dl/dd[@class='place']/text()").extract()[0].replace(
                    '地点:', '')
        except IndexError:
            item['place'] = 'None'
        try:
            item['stime'] = Tu.unite(each.xpath(
                "./dl/dd[@class='date']/text()").extract()[0].replace(
                    '时间:', ''))
        except IndexError:
            item['stime'] = 'None'
        yield item
    if self.offset < self.MAX_PAGE:
        self.offset += 1
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)
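# ---------------------------------------------------------------------------
# Minimal sketch of the spider class each parse() above plugs into. This is
# an assumption reconstructed from how the methods use self: the attribute
# names (name, url, offset, MAX_PAGE, pas_month, re_year) all appear in the
# code, but the import paths and the example values here are hypothetical.
# ---------------------------------------------------------------------------
import json

import scrapy

from ProjectSpiders.items import ProjectSpidersItem  # assumed module path
from ProjectSpiders.utils import Tu                  # assumed time-format helper


class ExampleLectureSpider(scrapy.Spider):
    name = 'example'        # also stored on every item as item['department']
    url = 'http://example.edu.cn/lectures/'  # hypothetical list-URL prefix
    offset = 1              # current page index, appended to url when paging
    MAX_PAGE = 10           # last page to request
    # Spiders that rebuild dates from bare month/day strings also keep:
    # pas_month = 12        # month seen on the previous entry
    # re_year = 2019        # year currently being assigned
    start_urls = [url + str(offset)]

    def parse(self, response):
        # one of the parse() implementations above
        ...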