Пример #1
0
    def parse(self, response):
        """Walk the report listing and schedule one detail request per entry.

        Entries whose name contains u'安排预告' are skipped and do not count
        towards the running report number.  The walk ends at the first entry
        whose name equals ``self.last_name``; otherwise the first counted
        entry is handed to ``sent_first`` before its request is yielded.
        """
        entries = response.xpath("//div[@class='box_info_list']/ul/li")
        print_new_number(self.counts, 'THU', self.name)

        counted = 0
        for position, entry in enumerate(entries):
            link_texts = entry.xpath(".//a/text()").extract()
            # The name is built from the first two extracted link text nodes.
            report_name = link_texts[0].strip() + link_texts[1].strip()
            href = entry.xpath(".//a/@href").extract()[0]
            report_url = self.domain + href[1:]

            if u'安排预告' in report_name:
                continue
            counted += 1

            if report_name == self.last_name:
                return
            if counted == 1:
                sent_first('THU', self.name, report_name)

            request_meta = {'link': report_url, 'number': position + 1}
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta=request_meta)
Пример #2
0
    def parse(self, response):
        """Yield a detail request per listed report, then follow pagination.

        The walk ends (and no further listing page is requested) as soon as
        a report dated before ``now_time`` is met.  Otherwise the next
        listing page is requested while the current page number, read from
        the response URL, is below the last page number shown in the pager.
        """
        links = response.xpath(
            "//div[@class='container row main in2']/div/ul/li/a/@href"
        ).extract()
        times = response.xpath(
            "//div[@class='container row main in2']/div/ul/li/span/text()"
        ).extract()

        print_new_number(self.counts, 'HFUT', self.name)
        for i, link in enumerate(links):
            report_time = get_localtime(times[i])
            if report_time < now_time:
                return

            report_url = self.domain + link[1:]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })

        # Current page number is the part between the last '-' and the
        # following '.' of the URL.
        number = int(response.url.split('-')[-1].split('.')[0])

        pager_labels = response.xpath(
            "//div[@id='pages']/a/text()").extract()
        if len(pager_labels) < 2:
            # No usable pager on this page, so there is nothing to follow.
            return

        # The pager label is text; convert it so the comparison below is
        # numeric rather than int-versus-string.
        # NOTE(review): assumes the second-to-last pager link holds the last
        # page number - confirm against the live markup.
        last_number = int(pager_labels[-2])
        if number < last_number:
            new_url = 'http://news.hfut.edu.cn/list-28-%d.html' % (number + 1)
            yield scrapy.Request(new_url, callback=self.parse)
Пример #3
0
    def parse_pages(self, response):
        """Build the record for a report page that only carries an image.

        The image path found in the page is appended to the page's own link
        (``response.meta['link']``) with its final path segment removed.
        """
        title = response.xpath(
            "//td[@height='30' and @align='center']/h4/text()").extract()[0]

        # First image source inside the content cell, minus its first two
        # characters.
        content_cell = response.xpath("//td[@align='left' and @class='cc']")
        relative_src = content_cell.xpath('.//img/@src').extract()[0][2:]

        # Re-assemble every '/'-separated piece of the link except the last.
        page_link = response.meta['link']
        leading_parts = page_link.split('/')[:-1]
        img_url = ''.join(part + '/' for part in leading_parts) + relative_src

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'USTC', self.name)

        return save_messages('USTC', self.name, title, '', '', '', '', '',
                             img_url, page_link, response.meta['number'],
                             u'中国科学技术大学', u'计算机科学与技术学院')
Пример #4
0
    def parse_pages(self, response):
        """Extract time, place, speaker and image from a report page.

        Every paragraph is checked for the labels u'时间', u'地点' and
        u'报告人', each followed by either an ASCII colon or a full-width
        colon (U+FF1A); the matching colon is passed to
        ``self.connect_messages`` as the separator.  The image URL is taken
        from the last paragraph that contains an image; paragraphs without
        one leave an already found image untouched.
        """
        paragraphs = response.xpath("//td[@height='400']/p")

        title = response.xpath("//h4/text()").extract()[0].strip()

        colons = (':', u'\uff1a')
        labelled_fields = (
            ('time', u'时间'),
            ('address', u'地点'),
            ('speaker', u'报告人'),
        )
        info = {'time': '', 'address': '', 'speaker': ''}
        img_url = ''

        for paragraph in paragraphs:
            text = self.get_messages(paragraph)

            # The labels are independent: one paragraph may fill several
            # fields.
            for field, label in labelled_fields:
                for colon in colons:
                    if label + colon in text:
                        info[field] = self.connect_messages(text, colon)
                        break

            img = paragraph.xpath(".//img/@src")
            if len(img) > 0:
                img_url = self.domain + img.extract()[0][1:]

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'THU', self.name)

        all_messages = save_messages('THU', self.name, title, info['time'],
                                     info['address'], info['speaker'], '', '',
                                     img_url, response.meta['link'],
                                     response.meta['number'], u'清华大学',
                                     u'化学工程系')

        return all_messages
Пример #5
0
    def parse_pages(self, response):
        """Collect the fields of one report page and hand them to
        ``save_messages``.
        """
        title = response.xpath("//h2/text()").extract()[0]

        paragraphs = response.xpath("//div[@id='artibody']/p")
        # The first three paragraphs are read as time, place and speaker, in
        # that order.  The organisation and host paragraphs that follow are
        # deliberately not used.
        time, address, speaker = [
            self.get_keys(paragraphs[index]) for index in range(3)
        ]

        person_introduce, content = self.get_person_and_content(response)

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'HFUT', self.name)

        return save_messages('HFUT', self.name, title, time, address, speaker,
                             person_introduce, content, '',
                             response.meta['link'], response.meta['number'],
                             u'合肥工业大学')
Пример #6
0
    def parse_pages(self, response):
        """Parse a report page made of labelled text fragments.

        A fragment that contains one of the known labels followed by an
        ASCII colon or a full-width colon (U+FF1A) sets the matching field
        via ``self.connect_message``; labels are tried in the order listed
        in ``rules`` and the first match wins.  Unlabelled fragments are
        appended to the speaker introduction until the u'摘要' label has
        been seen, and to the abstract afterwards.  Scanning stops at the
        first unlabelled fragment that contains u'邀请人'.
        """
        messages = response.xpath("//dd[@class='info']").xpath(
            ".//text()").extract()

        colons = (':', u'\uff1a')
        # (field, label, whether the label opens a free-text section)
        rules = (
            ('title', u'题目', False),
            ('time', u'时间', False),
            ('address', u'地点', False),
            ('speaker', u'报告人', False),
            ('person_introduce', u'简介', True),
            ('content', u'摘要', True),
        )
        info = {
            'title': '', 'time': '', 'address': '', 'speaker': '',
            'person_introduce': '', 'content': '',
        }
        # Field that receives unlabelled fragments.
        section = 'person_introduce'

        for message in messages:
            for field, label, opens_section in rules:
                colon = next((c for c in colons if label + c in message), None)
                if colon is None:
                    continue
                info[field] = self.connect_message(message, colon)
                if opens_section:
                    section = field
                break
            else:
                # No label matched.  The ``break`` below leaves the outer
                # loop because this clause sits outside the inner one.
                if u'邀请人' in message:
                    break
                info[section] += message.strip()

        if info['title'] != '':
            self.counts += 1
            print_new_number(self.counts, 'WHU', self.name)

        all_messages = save_messages('WHU', self.name, info['title'],
                                     info['time'], info['address'],
                                     info['speaker'],
                                     info['person_introduce'],
                                     info['content'], '',
                                     response.meta['link'],
                                     response.meta['number'], u'武汉大学',
                                     u'计算机学院')

        return all_messages
Пример #7
0
    def parse_pages(self, response):
        """Parse a report page whose paragraphs carry Chinese or English
        labels.

        Paragraphs containing u'欢迎大家', u'联系人' or u'紫金山天文台学术委员会'
        are ignored.  Otherwise the first rule with a matching label sets
        its field via ``self.connect_message``.  A label matches when it is
        followed by an ASCII colon or a full-width colon (U+FF1A); the ASCII
        form takes precedence when both occur.  Unlabelled paragraphs are
        appended to the introduction or the abstract, whichever label was
        seen last, and are dropped before either has been seen.
        """
        messages = response.xpath("//div[@class='TRS_Editor']").xpath(".//p")

        colons = (':', u'\uff1a')
        # (field, accepted labels, whether the label opens a free-text section)
        rules = (
            ('title', (u'题目', u'Title'), False),
            ('speaker', (u'报告人', u'Speaker', u'主讲人'), False),
            ('time', (u'时间', u'Time'), False),
            ('address', (u'地点', u'Address'), False),
            ('person_introduce', (u'简介', u'Bio'), True),
            ('content', (u'摘要', u'Abstract'), True),
        )
        info = {
            'title': '', 'speaker': '', 'time': '', 'address': '',
            'person_introduce': '', 'content': '',
        }
        section = None

        def label_colon(text, labels):
            """Return the colon that follows one of *labels* in *text*, or None."""
            for colon in colons:
                for label in labels:
                    if label + colon in text:
                        return colon
            return None

        for message in messages:
            text = self.get_text(message)
            if u'欢迎大家' in text or u'联系人' in text or u'紫金山天文台学术委员会' in text:
                continue

            for field, labels, opens_section in rules:
                colon = label_colon(text, labels)
                if colon is None:
                    continue
                info[field] = self.connect_message(text, colon)
                if opens_section:
                    section = field
                break
            else:
                # Unlabelled paragraph: continuation of the open section.
                if section is not None:
                    info[section] += text.strip()

        if info['title'] != '':
            self.counts += 1
            print_new_number(self.counts, 'USTC', self.name)

        all_messages = save_messages('USTC', self.name, info['title'],
                                     info['time'], info['address'],
                                     info['speaker'],
                                     info['person_introduce'],
                                     info['content'], '',
                                     response.meta['link'],
                                     response.meta['number'], u'中国科学技术大学')

        return all_messages
Пример #8
0
    def parse_pages(self, response):
        """Parse a report page made of labelled paragraphs.

        Each paragraph is checked against every label; a label counts when
        it is followed by an ASCII colon or a full-width colon (U+FF1A), and
        one paragraph may fill several fields.  Only a paragraph that
        matches no label at all is treated as continuation text.  It is
        appended, after a newline, to the introduction or the abstract,
        whichever label was seen last.  Continuation paragraphs containing
        u'联系人' are dropped.
        """
        messages = response.xpath("//div[@class='box_detail']/p")

        colons = (':', u'\uff1a')
        # (field, label, whether the label opens a free-text section)
        rules = (
            ('title', u'题目', False),
            ('time', u'时间', False),
            ('address', u'地点', False),
            ('speaker', u'报告人', False),
            ('person_introduce', u'简介', True),
            ('content', u'摘要', True),
        )
        info = {
            'title': '', 'time': '', 'address': '', 'speaker': '',
            'person_introduce': '', 'content': '',
        }
        section = None

        for message in messages:
            text = self.get_messages(message)

            labelled = False
            for field, label, opens_section in rules:
                colon = next((c for c in colons if label + c in text), None)
                if colon is None:
                    continue
                labelled = True
                info[field] = self.connect_messages(text, colon)
                if opens_section:
                    section = field

            # A labelled paragraph has already been stored in its field, so
            # it must not also be appended as continuation text.
            if labelled or u'联系人' in text:
                continue
            if section is not None:
                info[section] += '\n' + text

        if info['title'] != '':
            self.counts += 1
            print_new_number(self.counts, 'THU', self.name)

        all_messages = save_messages('THU', self.name, info['title'],
                                     info['time'], info['address'],
                                     info['speaker'],
                                     info['person_introduce'],
                                     info['content'], '',
                                     response.meta['link'],
                                     response.meta['number'], u'清华大学',
                                     u'数学科学系')

        return all_messages
Пример #9
0
	def parse(self, response):
		"""Schedule a detail request for each listed report.

		The walk ends at the first entry dated before ``now_time``.
		"""
		entries = response.xpath("//ul[@class='list-none metlist']/li")
		print_new_number(self.counts, 'USTC', self.name)

		for position, entry in enumerate(entries):
			href = entry.xpath(".//a/@href").extract()[0]
			# The first two characters of the href are dropped before it is
			# appended to the domain.
			report_url = self.domain + href[2:]
			date_text = entry.xpath(".//span/text()").extract()[0].strip()

			if get_localtime(date_text) < now_time:
				return

			request_meta = {'link': report_url, 'number': position + 1}
			yield scrapy.Request(report_url, callback=self.parse_pages, meta=request_meta)
Пример #10
0
	def parse_pages(self, response):
		"""Build the record for a report page that only carries an image.

		No title is scraped here (it is always the empty string), so this
		callback never advances the report counter.
		"""
		# The first six characters of the 'time' meta value form a path
		# segment between the domain and the image path.
		url_prefix = self.domain + str(response.meta['time'])[0:6]
		image_src = response.xpath("//p[@align='center']/img/@src").extract()[0]
		img_url = url_prefix + image_src[1:]

		return save_messages('USTC', self.name, '', '', '', '', '', '', img_url,
		                     response.meta['link'], response.meta['number'], u'中国科学技术大学')
Пример #11
0
    def parse_pages(self, response):
        """Parse a report page made of labelled paragraphs.

        The first rule whose label occurs in a paragraph, followed by an
        ASCII colon or a full-width colon (U+FF1A), sets its field via
        ``self.connect_message`` with that colon as the separator.
        Paragraphs containing u'主持人' are ignored unless they carry the
        title label.  Unlabelled paragraphs are appended to the
        introduction until the u'摘要' label has been seen, and to the
        abstract afterwards.
        """
        messages = response.xpath("//div[@id='vsb_content']/p")

        # (field, label, whether the label opens a free-text section)
        rules = (
            ('title', u'题目', False),
            ('time', u'时间', False),
            ('address', u'地点', False),
            ('speaker', u'报告人', False),
            ('person_introduce', u'简介', True),
            ('content', u'摘要', True),
        )
        info = {
            'title': '', 'time': '', 'address': '', 'speaker': '',
            'person_introduce': '', 'content': '',
        }
        # Field that receives unlabelled paragraphs.
        section = 'person_introduce'

        def label_colon(text, label):
            """Return the colon that follows *label* in *text*, or None."""
            for colon in (':', u'\uff1a'):
                if label + colon in text:
                    return colon
            return None

        for message in messages:
            text = self.get_messages(message)

            # Host lines are skipped, but the title label takes precedence.
            if label_colon(text, u'题目') is None and u'主持人' in text:
                continue

            for field, label, opens_section in rules:
                colon = label_colon(text, label)
                if colon is None:
                    continue
                info[field] = self.connect_message(text, colon)
                if opens_section:
                    section = field
                break
            else:
                # Unlabelled paragraph: continuation of the open section.
                info[section] += text

        if info['title'] != '':
            self.counts += 1
            print_new_number(self.counts, 'NPU', self.name)

        all_messages = save_messages('NPU', self.name, info['title'],
                                     info['time'], info['address'],
                                     info['speaker'],
                                     info['person_introduce'],
                                     info['content'], '',
                                     response.meta['link'],
                                     response.meta['number'], u'西北工业大学',
                                     u'计算机学院')

        return all_messages
Пример #12
0
	def parse(self, response):
		"""Schedule detail requests for the rows of the report table.

		The walk ends at the first row dated before ``now_time``; rows whose
		title contains u'本周报告' are skipped.
		"""
		rows = response.xpath("//td[@class='middle']").xpath(".//tr")
		print_new_number(self.counts, 'USTC', self.name)

		for position, row in enumerate(rows):
			report_title = row.xpath(".//span/a/text()").extract()[0]
			href = row.xpath(".//span/a/@href").extract()[0]
			report_url = self.domain + href
			# The date is the last link text of the row, with surrounding
			# parentheses stripped.
			date_text = row.xpath(".//span/a/text()").extract()[-1].strip('()')

			if get_localtime(date_text) < now_time:
				return
			if u'本周报告' in report_title:
				continue

			request_meta = {'link': report_url, 'number': position + 1}
			yield scrapy.Request(report_url, callback=self.parse_pages, meta=request_meta)
Пример #13
0
	def parse(self, response):
		"""Schedule a detail request for each listed report.

		Links and dates are extracted as two parallel lists; the walk ends
		at the first report dated before ``now_time``.
		"""
		links = response.xpath("//li[@width='30%']/a/@href").extract()
		times = response.xpath("//li[@width='30%']/span/text()").extract()
		print_new_number(self.counts, 'USTC', self.name)

		for position, href in enumerate(links):
			# The first two characters of the href are dropped before it is
			# appended to the domain.
			report_url = self.domain + href[2:]

			if get_localtime(times[position]) < now_time:
				return

			request_meta = {'link': report_url, 'number': position + 1}
			yield scrapy.Request(report_url, callback=self.parse_pages, meta=request_meta)
Пример #14
0
    def parse(self, response):
        """Schedule a detail request for every entry of the listing."""
        entries = response.xpath("//ul[@class='zjlt clearfix mt-45']/li")
        print_new_number(self.counts, 'THU', self.name)

        for number, entry in enumerate(entries, 1):
            info_box = entry.xpath(".//div[@class='info fr']")
            href = info_box.xpath('.//a/@href').extract()[0]
            # The first character of the href is dropped before it is
            # appended to the domain.
            report_url = self.domain + href[1:]

            request_meta = {'link': report_url, 'number': number}
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta=request_meta)
Пример #15
0
    def parse_pages(self, response):
        """Collect the fields of one report page from its content panels.

        The first three panels are read as place, time and speaker.  Each
        later panel is classified by its field label: u'报告人简介' fills the
        introduction, u'题目' the title, anything else the abstract.  If no
        title panel was found, the page's ``<h1>`` text is used instead.
        """
        panels = response.xpath(
            "//div[@class='inside panels-flexible-region-inside panels-flexible-region-jcjz-center-inside panels-flexible-region-inside-first']/div"
        )

        address, time, speaker = [
            self.get_message(panels[index]) for index in range(3)
        ]

        title = person_introduce = content = img_url = ''
        for panel in panels[3:]:
            part_name = panel.xpath(
                ".//div[@class='field-label']/text()").extract()[0]

            # A panel may carry an image as well as text.
            if panel.xpath(".//img"):
                img_url = self.get_img(panel)

            if u'报告人简介' in part_name:
                person_introduce = self.get_message(panel)
            elif u'题目' in part_name:
                title = self.get_message(panel)
            else:
                content = self.get_message(panel)

        if title == '':
            title = response.xpath("//h1/text()").extract()[0]
        if img_url != '':
            # Drop the first character of the path before prefixing the
            # domain.
            img_url = self.domain + img_url[1:]

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'USTC', self.name)

        return save_messages('USTC', self.name, title, time, address, speaker,
                             person_introduce, content, img_url,
                             response.meta['link'], response.meta['number'],
                             u'中国科学技术大学', u'地球和空间科学学院')
Пример #16
0
	def parse(self, response):
		"""Schedule a detail request for each listed report.

		Entries whose link text contains u'青年论坛' use their href as is; all
		others drop the first nine characters of the href and prefix the
		domain.  URLs containing 'Colloquium' are skipped, and the walk ends
		at the first entry dated before ``now_time``.
		"""
		entries = response.xpath("//div[@class='list']/ul/li")
		print_new_number(self.counts, 'USTC', self.name)

		for position, entry in enumerate(entries):
			link_text = entry.xpath(".//a/text()").extract()[0]
			href = entry.xpath(".//a/@href").extract()[0]
			if u'青年论坛' in link_text:
				report_url = href
			else:
				report_url = self.domain + href[9:]

			if 'Colloquium' in report_url:
				continue

			# The listed date has its square brackets stripped and is
			# prefixed with '20' before parsing.
			date_text = entry.xpath(".//span/text()").extract()[0].strip('[]')
			if get_localtime('20' + date_text) < now_time:
				return

			request_meta = {'link': report_url, 'number': position + 1}
			yield scrapy.Request(report_url, callback=self.parse_pages, meta=request_meta)
Пример #17
0
	def parse_pages(self, response):
		"""Parse a report page whose text fragments arrive in no fixed order.

		All fragments except the last one are scanned.  A fragment holding
		a known label, followed by an ASCII colon or a full-width colon
		(U+FF1A), appends its value to the matching field and makes that
		field the current one.  The ASCII form takes precedence when both
		occur.  'Abstract.' and u'摘要.' are also accepted, with '.' as the
		separator.  Unlabelled fragments are appended to the current field
		unless they contain u'欢迎'.
		"""
		messages = response.xpath("//table[@width='96%']").xpath(".//td[@align='left' and @class='dh01']").xpath(".//text()").extract()

		colons = (':', u'\uff1a')
		# (field, accepted labels), tried in this order.
		rules = (
			('title', (u'Title', u'题目')),
			('time', (u'Time', u'时间')),
			('address', (u'Place', u'地点')),
			('speaker', (u'Speaker', u'报告人')),
			('content', (u'Abstract', u'摘要')),
		)
		info = {'title': '', 'time': '', 'address': '', 'speaker': '', 'content': ''}
		# Field that receives unlabelled fragments; None until a label is seen.
		current = None

		def label_colon(text, labels):
			"""Return the colon that follows one of *labels* in *text*, or None."""
			for colon in colons:
				for label in labels:
					if label + colon in text:
						return colon
			return None

		for message in messages[:-1]:
			for field, labels in rules:
				colon = label_colon(message, labels)
				if colon is None:
					continue
				current = field
				info[field] += self.get_message(message, colon)
				break
			else:
				# No colon-terminated label matched.
				if 'Abstract.' in message or u'摘要.' in message:
					current = 'content'
					info['content'] += self.get_message(message, '.')
				elif u'欢迎' in message:
					pass
				elif current is not None:
					info[current] += message

		if info['title'] != '':
			self.counts += 1
			print_new_number(self.counts, 'USTC', self.name)

		all_messages = save_messages('USTC', self.name, info['title'], info['time'], info['address'], info['speaker'], '',
		                             info['content'], '', response.meta['link'], response.meta['number'], u'中国科学技术大学', u'数学科学学院')

		return all_messages
Пример #18
0
    def parse(self, response):
        """Schedule a detail request for each listed report.

        The walk ends at the first entry dated before ``now_time``.
        """
        entries = response.xpath("//div[@id='container']/dl/dd")
        print_new_number(self.counts, 'WHU', self.name)

        for position, entry in enumerate(entries):
            href = entry.xpath(".//a/@href").extract()[0]
            report_url = self.domain + href[1:]

            # Only the part before the first space of the stamp is parsed.
            stamp = entry.xpath(".//i/text()").extract()[0]
            if get_localtime(stamp.split(' ')[0]) < now_time:
                return

            request_meta = {'link': report_url, 'number': position + 1}
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta=request_meta)
Пример #19
0
	def parse(self, response):
		"""Yield a detail request per listed report, then follow pagination.

		The walk ends, without requesting another page, at the first entry
		dated before ``now_time``.  Otherwise the next listing page is
		requested while the current page number is below the number read
		from the page-info element.
		"""
		entries = response.xpath("//div[@id='container']/dl/dd")
		print_new_number(self.counts, 'WHU', self.name)

		for position, entry in enumerate(entries):
			href = entry.xpath(".//a/@href").extract()[0]
			report_url = self.domain + href[1:]

			# Only the part before the first space of the stamp is parsed.
			stamp = entry.xpath(".//i/text()").extract()[0]
			if get_localtime(stamp.split(' ')[0]) < now_time:
				return

			request_meta = {'link': report_url, 'number': position + 1}
			yield scrapy.Request(report_url, callback=self.parse_pages, meta=request_meta)

		current_label = response.xpath("//div[@class='page fn_clear']/ul/li[@class='thisclass']/text()").extract()[0]
		now_number = int(current_label)
		first_strong = response.xpath("//span[@class='pageinfo']/strong")[0]
		last_number = int(first_strong.xpath(".//text()").extract()[0])

		if now_number >= last_number:
			return
		new_url = 'http://cs.whu.edu.cn/a/xueshujiangzuofabu/list_39_{}.html'.format(now_number + 1)
		yield scrapy.Request(new_url, callback=self.parse)
Пример #20
0
    def parse_pages(self, response):
        """Parse a report page made of labelled paragraphs.

        The first rule whose label occurs in a paragraph, followed by an
        ASCII colon or a full-width colon (U+FF1A), sets its field via
        ``connect_messages`` with that colon as the separator.  Once the
        u'摘要' label has been seen, every unlabelled paragraph is appended
        to the abstract.
        """
        messages = response.xpath("//div[@class='box_detail']/p")

        colons = (':', u'\uff1a')
        # (field, label), tried in this order.  The speaker label is just
        # u'人', so any label ending in that character matches it.
        rules = (
            ('title', u'题目'),
            ('time', u'时间'),
            ('address', u'地点'),
            ('speaker', u'人'),
            ('content', u'摘要'),
        )
        info = {'title': '', 'time': '', 'address': '', 'speaker': '',
                'content': ''}
        in_abstract = False

        for message in messages:
            text = get_messages(message)
            for field, label in rules:
                colon = next((c for c in colons if label + c in text), None)
                if colon is None:
                    continue
                info[field] = connect_messages(text, colon)
                if field == 'content':
                    in_abstract = True
                break
            else:
                # Unlabelled paragraph: continuation of the abstract, if open.
                if in_abstract:
                    info['content'] += text.strip()

        if info['title'] != '':
            self.counts += 1
            print_new_number(self.counts, 'THU', self.name)

        all_messages = save_messages('THU', self.name, info['title'],
                                     info['time'], info['address'],
                                     info['speaker'], '', info['content'], '',
                                     response.meta['link'],
                                     response.meta['number'], u'清华大学', u'物理系')

        return all_messages
Пример #21
0
    def parse(self, response):
        """Yield a detail request per current report, then follow pagination.

        Rows whose class column lacks u'学术报告' are ignored.  The rows of a
        page are not sorted by date, so a row dated before ``now_time`` is
        skipped rather than ending the walk; it only prevents the next page
        from being requested once the whole page has been processed.  No
        further page is requested either when the pager shows no
        current-page element.
        """
        rows = response.xpath(
            "//div[@class='view-content']/table/tbody/tr")
        print_new_number(self.counts, 'USTC', self.name)

        saw_expired = False
        for i, row in enumerate(rows):
            cells = row.xpath(".//td")
            report_url = self.domain + cells[0].xpath(
                ".//a/@href").extract()[0][1:]
            report_class = cells[1].xpath(".//text()").extract()[0].strip()
            report_time = get_localtime(
                cells[2].xpath(".//text()").extract()[0].strip())

            if u'学术报告' not in report_class:
                continue
            if report_time < now_time:
                saw_expired = True
                continue

            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })

        if saw_expired:
            return

        # The current-page item carries an extra 'first' class on one of
        # the pages, so both class spellings are tried.
        current_labels = response.xpath(
            "//ul[@class='pager']/li[@class='pager-current first']/text()"
        ).extract()
        if len(current_labels) == 0:
            current_labels = response.xpath(
                "//ul[@class='pager']/li[@class='pager-current']/text()"
            ).extract()
        if len(current_labels) == 0:
            # No pager on this page, so there is nothing left to follow.
            return

        now_number = int(current_labels[0])
        # NOTE(review): the displayed page label is used unchanged as the
        # ``page`` query value - presumably the query parameter is
        # zero-based while the labels are one-based; confirm.
        next_url = 'http://ess.ustc.edu.cn/notice?page=%d' % now_number

        yield scrapy.Request(next_url, callback=self.parse)
0
	def parse(self, response):
		"""Yield a detail request per listed report, then follow pagination.

		The walk ends, without requesting another page, at the first row
		dated before ``now_time``; rows whose title contains u'本周报告' are
		skipped.  The next listing page is requested only while the current
		page number is below the last page number.
		"""
		messages = response.xpath("//td[@class='middle']").xpath(".//tr")
		print_new_number(self.counts, 'USTC', self.name)

		for i, message in enumerate(messages):
			report_title = message.xpath(".//span/a/text()").extract()[0]
			report_url = self.domain + message.xpath(".//span/a/@href").extract()[0]
			# The date is the last link text of the row, with surrounding
			# parentheses stripped.
			report_time = get_localtime(message.xpath(".//span/a/text()").extract()[-1].strip('()'))
			if report_time < now_time:
				return
			if u'本周报告' in report_title:
				continue
			yield scrapy.Request(report_url, callback=self.parse_pages, meta={'link': report_url, 'number': i + 1})

		pager_labels = response.xpath("//a[@href='#']").xpath(".//text()").extract()
		now_number = int(pager_labels[0])
		# NOTE(review): assumes the last '#' link reads as one marker
		# character followed by the total page count - confirm against the
		# live markup.
		last_number = int(pager_labels[-1][1:])
		# Stop on the last page itself; requesting page last + 1 would ask
		# for a page that does not exist.
		if now_number >= last_number:
			return
		next_url = 'http://math.ustc.edu.cn/new/list.php?fid=35&page=%d' % (now_number + 1)

		yield scrapy.Request(next_url, callback=self.parse)
Пример #23
0
    def parse_pages(self, response):
        """Collect the fields of one report page.

        Time, place and speaker are the second text node of the first three
        paragraphs of the info box.  The abstract is the first text node of
        the 'show-new' box when that box exists.  If it carries a u'简介' or
        'Abstract' label, followed by an ASCII colon or a full-width colon
        (U+FF1A), it is passed through ``self.connect_messages`` with that
        colon as the separator.  Reports dated before ``now_time`` are saved
        with an empty title and are not counted.
        """
        info_box = response.xpath("//div[@class='bbs-info']")

        title = self.try_get_message(info_box.xpath(".//h2/text()").extract())

        paragraphs = info_box.xpath(".//p")
        time = paragraphs[0].xpath(".//text()").extract()[1]
        address = paragraphs[1].xpath(".//text()").extract()[1]
        speaker = paragraphs[2].xpath(".//text()").extract()[1]

        other = response.xpath("//div[@class='show-new']")
        content = ''
        if len(other) != 0:
            content = other.xpath(".//text()").extract()[0].strip()
            # The ASCII colon is tried first, then the full-width one.
            for colon in (':', u'\uff1a'):
                if u'简介' + colon in content or u'Abstract' + colon in content:
                    content = self.connect_messages(content, colon)
                    break

        report_time = get_localtime(
            response.xpath("//div[@class='wtime']/text()").extract()
            [0].strip())
        if report_time < now_time:
            title = ''
        else:
            self.counts += 1
        print_new_number(self.counts, 'THU', self.name)

        all_messages = save_messages('THU', self.name, title, time, address,
                                     speaker, '', content, '',
                                     response.meta['link'],
                                     response.meta['number'], u'清华大学')

        return all_messages
Пример #24
0
    def parse(self, response):
        """Schedule a detail request for each academic-report entry.

        Only entries whose link text contains u'学术报告' followed by an ASCII
        colon or a full-width colon (U+FF1A) are considered.  The walk ends
        at the first such entry dated before ``now_time``.
        """
        messages = response.xpath("//div[@class='full-page-list']/ul/li")
        print_new_number(self.counts, 'SYSU', self.name)

        for i, message in enumerate(messages):
            report_name = message.xpath(".//a/text()").extract()[0]
            if (u'学术报告:' not in report_name
                    and u'学术报告\uff1a' not in report_name):
                continue

            # NOTE(review): the attribute read here is ``domains`` (plural);
            # confirm the spider defines it under that name.
            report_url = self.domains + message.xpath(
                ".//a/@href").extract()[0][1:]
            # The listed date uses '/' separators; they are replaced with
            # '-' before parsing.
            report_time = get_localtime(
                message.xpath(".//span/text()").extract()[0].replace(
                    '/', '-'))

            if report_time < now_time:
                return
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': i + 1
                                 })
Пример #25
0
	def parse_pages(self, response):
		"""Collect the labelled fields of a report page and pass them to save_messages.

		Every text node under ``div.justify p`` is checked against a fixed,
		ordered list of labels. A matching node replaces the corresponding
		field with ``self.connect_messages(node, ':')``. The abstract and
		speaker-introduction labels additionally make later unlabelled nodes
		append to that field (newline-separated, stripped) until the other of
		the two labels appears. Unlabelled nodes containing ``联系人`` are
		dropped.

		``self.counts`` is incremented, and reported through
		``print_new_number``, only when a title was found.

		Args:
			response: Detail-page response; ``response.meta`` must carry
				``link`` and ``number``.

		Returns:
			Whatever ``save_messages`` returns for the collected fields.
		"""
		nodes = response.xpath("//div[@class='justify']").xpath(".//p").xpath(".//text()").extract()

		# (label, field, whether unlabelled follow-up lines continue this field).
		# Order matters: the first label found in a node wins.
		# NOTE(review): each label used to be tested twice with identical text;
		# a full-width-colon variant may have been intended -- confirm.
		rules = (
			(u'题目:', 'title', False),
			(u'时间:', 'time', False),
			(u'地点:', 'address', False),
			(u'报告人:', 'speaker', False),
			(u'摘要:', 'content', True),
			(u'简介:', 'person_introduce', True),
		)
		fields = {
			'title': '',
			'time': '',
			'address': '',
			'speaker': '',
			'person_introduce': '',
			'content': '',
		}

		# Field that unlabelled lines are appended to; None until an
		# abstract or introduction label has been seen.
		trailing = None
		for node in nodes:
			for label, key, multiline in rules:
				if label in node:
					fields[key] = self.connect_messages(node, ':')
					if multiline:
						trailing = key
					break
			else:
				# No label matched: contact lines are ignored, anything
				# else extends the open multi-line field, if there is one.
				if u'联系人' in node:
					continue
				if trailing is not None:
					fields[trailing] += '\n' + node.strip()

		if fields['title'] != '':
			self.counts += 1
			print_new_number(self.counts, 'USTC', self.name)

		return save_messages(
			'USTC', self.name, fields['title'], fields['time'], fields['address'],
			fields['speaker'], fields['person_introduce'], fields['content'], '',
			response.meta['link'], response.meta['number'], u'中国科学技术大学')
Пример #26
0
    def parse(self, response):
        """Yield a detail-page request for every upcoming academic-report row.

        Each ``<tr>`` of the table under ``div.view-content`` is read by cell
        position: cell 0 holds the link, cell 1 the category text and cell 2
        the date. Rows whose category lacks ``学术报告`` and rows dated
        earlier than ``now_time`` are skipped; scanning always continues to
        the end of the table.

        Args:
            response: Listing-page response to scan.

        Yields:
            scrapy.Request: One request per accepted row, handled by
            ``self.parse_pages``. ``meta['link']`` is the built URL and
            ``meta['number']`` is the row's 1-based position in the table.

        Raises:
            IndexError: If a row has fewer than three cells or an empty
                category cell, or if a report row lacks date text or a link.
        """
        rows = response.xpath("//div[@class='view-content']/table/tbody/tr")
        print_new_number(self.counts, 'USTC', self.name)

        # enumerate() replaces the Python-2-only xrange(len(...)) index loop.
        for number, row in enumerate(rows, 1):
            cells = row.xpath(".//td")

            # Filter on the category first, so a row that is not a report
            # cannot abort the callback through a missing link or an
            # unparsable date cell.
            report_class = cells[1].xpath(".//text()").extract()[0].strip()
            if u'学术报告' not in report_class:
                continue

            report_time = get_localtime(
                cells[2].xpath(".//text()").extract()[0].strip())
            if report_time < now_time:
                # Past reports are skipped, not treated as an end marker.
                continue

            # The href's first character is dropped before prefixing
            # self.domain.
            href = cells[0].xpath(".//a/@href").extract()[0]
            report_url = self.domain + href[1:]
            yield scrapy.Request(report_url,
                                 callback=self.parse_pages,
                                 meta={
                                     'link': report_url,
                                     'number': number
                                 })
Пример #27
0
    def parse_pages(self, response):
        """Split a report page into labelled sections and pass them to save_messages.

        The title is the concatenation of every stripped text node under any
        ``<h2>``. The text nodes under ``div.box_detail > p`` are then read in
        order. A node containing one of the known English labels starts that
        section, and its value is ``self.get_messages(node, ':')``. Any node
        without a label is appended (newline-separated, stripped) to the most
        recently started section, and ignored if no section has started yet.

        ``self.counts`` is incremented, and reported through
        ``print_new_number``, only when the title is non-empty.

        Args:
            response: Detail-page response; ``response.meta`` must carry
                ``link`` and ``number``.

        Returns:
            Whatever ``save_messages`` returns for the collected fields.
        """
        heading_parts = response.xpath("//h2").xpath(".//text()").extract()
        title = ''.join(part.strip() for part in heading_parts)

        nodes = response.xpath("//div[@class='box_detail']/p").xpath(
            ".//text()").extract()

        # Checked top to bottom; the first section with a matching label wins.
        sections = (
            ('time', ('Time:',)),
            ('address', ('Venue:', 'Meeting Room:', 'Location:')),
            ('speaker', ('Speaker:',)),
            ('person_introduce', ('Bio:', 'Biography:')),
            ('content', ('Abstract:',)),
        )
        fields = {
            'time': '',
            'address': '',
            'speaker': '',
            'person_introduce': '',
            'content': '',
        }

        # Name of the section that unlabelled nodes currently extend.
        current = None
        for node in nodes:
            for key, labels in sections:
                if any(label in node for label in labels):
                    current = key
                    fields[key] = self.get_messages(node, ':')
                    break
            else:
                if current is not None:
                    fields[current] += '\n' + node.strip()

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'THU', self.name)

        return save_messages('THU', self.name, title, fields['time'],
                             fields['address'], fields['speaker'],
                             fields['person_introduce'], fields['content'],
                             '', response.meta['link'],
                             response.meta['number'], u'清华大学',
                             u'计算机科学与技术系')
Пример #28
0
    def parse_pages(self, response):
        """Collect the labelled fields of a report page and pass them to save_messages.

        For each ``<p>`` under ``div.field-items``, the paragraph's text nodes
        are passed to ``self.connect_messages``, which returns a ``(text,
        replace)`` pair. Labels are looked up mainly in ``replace``, and values
        are taken from ``text`` via ``self.get_messages(text, ':')``. How the
        two strings differ is decided by ``connect_messages`` and is not
        visible here.

        Branch order is significant and is kept as-is: title, time, address,
        speaker, date, a second address check against ``text``, speaker
        introduction, abstract. After an introduction or abstract label,
        paragraphs without any label are appended verbatim to that field.

        The date is prepended to the time (space-separated, then stripped).
        ``self.counts`` is incremented, and reported through
        ``print_new_number``, only when a title was found.

        Args:
            response: Detail-page response; ``response.meta`` must carry
                ``link`` and ``number``.

        Returns:
            Whatever ``save_messages`` returns for the collected fields.
        """
        def mentions(haystack, *labels):
            # True when at least one label occurs in haystack.
            return any(label in haystack for label in labels)

        paragraphs = response.xpath("//div[@class='field-items']").xpath(
            ".//p")

        title = time = address = speaker = ''
        person_introduce = content = date = ''
        # Which long field unlabelled paragraphs extend: None, 'bio' or
        # 'abstract'.
        tail = None
        for paragraph in paragraphs:
            text, replace = self.connect_messages(
                paragraph.xpath(".//text()").extract())

            if mentions(replace, u'题目:', 'Title:'):
                title = self.get_messages(text, ':')
            elif mentions(replace, u'时间', 'Time:'):
                # The Chinese time label is matched without its colon.
                time = self.get_messages(text, ':')
            elif mentions(replace, u'地点:', 'Address:'):
                address = self.get_messages(text, ':')
            elif mentions(replace, u'主讲:', u'报告人:', 'Speaker:'):
                speaker = self.get_messages(text, ':')
            elif mentions(replace, u'日期:', 'Date:'):
                date = self.get_messages(text, ':')
            elif u'地点:' in text:
                # Address label present in text although absent from replace.
                address = self.get_messages(text, ':')
            elif u'简介:' in text or mentions(replace, 'Biography:', 'Bio:',
                                            u'简介:'):
                tail = 'bio'
                person_introduce = self.get_messages(text, ':')
            elif mentions(replace, u'摘要:', 'Abstract:'):
                tail = 'abstract'
                content = self.get_messages(text, ':')
            elif tail == 'bio':
                person_introduce += text
            elif tail == 'abstract':
                content += text

        time = (date + ' ' + time).strip()

        if title != '':
            self.counts += 1
            print_new_number(self.counts, 'SYSU', self.name)

        return save_messages('SYSU', self.name, title, time, address,
                             speaker, person_introduce, content, '',
                             response.meta['link'],
                             response.meta['number'], u'中山大学',
                             u'数据科学与计算机学院')