예제 #1
0
 def parse(self, response):
     """Yield one detail-page request per course link on the listing page.

     Each ``a.course-box`` element supplies the course name and image. The
     partially filled ``MultipageCourseItem`` is carried to
     ``self.parse_author`` in the request's ``meta`` under the key ``'item'``.
     """
     for card in response.css('a.course-box'):
         title = card.xpath('.//div[@class="course-name"]/text()').extract_first()
         picture = card.xpath('.//img/@src').extract_first()
         # Resolve the card's href against the response URL.
         detail_url = response.urljoin(card.xpath('@href').extract_first())
         yield scrapy.Request(
             url=detail_url,
             callback=self.parse_author,
             meta={'item': MultipageCourseItem(name=title, image=picture)},
         )
예제 #2
0
 def parse(self, response):
     """Yield a detail-page request for each course card on the listing page.

     For every ``div.col-md-3`` element the course name and cover image are
     collected into a ``MultipageCourseItem``; the item is attached to the
     request as ``meta['item']`` so ``self.parse_author`` can complete it.

     A card without ``h6.course-name`` text gets ``name=None`` rather than
     aborting the whole page with ``AttributeError``. A card without any
     link is skipped because there is no detail page to request.
     """
     for course in response.css('div.col-md-3'):
         course_url = course.css('a::attr(href)').extract_first()
         if course_url is None:
             # urljoin(None) would only return the page's own base URL,
             # which is not a course detail page.
             continue
         raw_name = course.css('h6.course-name::text').extract_first()
         item = MultipageCourseItem(
             # extract_first() returns None when nothing matches, so strip
             # only when there is text to strip.
             name=raw_name.strip() if raw_name is not None else None,
             image=course.css('img.cover-image::attr(src)').extract_first())
         full_course_url = response.urljoin(course_url)
         request = scrapy.Request(full_course_url, self.parse_author)
         request.meta['item'] = item
         yield request
예제 #3
0
 def parse(self, response):
     """Yield a detail-page request for each ``div.col-10`` entry.

     ``name`` and ``update_time`` are stored on a ``MultipageCourseItem``
     that is attached to the request as ``meta['item']`` for
     ``self.parse_details``.

     Missing nodes no longer raise ``AttributeError``: an absent name or
     timestamp yields an empty list, and an entry without any link is
     skipped because there is no detail page to request.
     """
     for course in response.css('div.col-10'):
         course_href = course.xpath('.//a/@href').extract_first()
         if course_href is None:
             # urljoin(None) would only return the page's own base URL.
             continue
         # default='' keeps .split() callable when the selector matches
         # nothing (extract_first() would otherwise return None).
         # NOTE(review): split() stores a list of whitespace-separated
         # tokens, not a stripped string -- confirm that is what the
         # downstream consumers of the item expect.
         item = MultipageCourseItem(
             name=course.css('h3 a::text').extract_first(default='').split(),
             update_time=course.css(
                 'relative-time::attr(datetime)'
             ).extract_first(default='').split())
         full_course_url = response.urljoin(course_href)
         request = scrapy.Request(full_course_url,
                                  callback=self.parse_details)
         request.meta['item'] = item
         yield request
예제 #4
0
파일: multipage.py 프로젝트: Er876/class20
 def parse(self, response):
     """Parse the course listing and schedule one detail request per course.

     The name and image are read from each ``a.course-box`` element here; the
     unfinished item is handed to ``self.parse_author`` via ``request.meta``.
     """
     for anchor in response.css('a.course-box'):
         # The scraped link is relative; urljoin builds the full URL.
         detail_url = response.urljoin(anchor.xpath('@href').extract_first())
         partial_item = MultipageCourseItem(
             # Course name
             name=anchor.xpath(
                 './/div[@class="course-name"]/text()').extract_first(),
             # Course image
             image=anchor.xpath('.//img/@src').extract_first(),
         )
         # Build the request for the course detail page and set its callback.
         detail_request = scrapy.Request(detail_url, callback=self.parse_author)
         # Pass the unfinished item to parse_author through meta.
         detail_request.meta['item'] = partial_item
         yield detail_request
예제 #5
0
 def parse(self, response):
     """Yield a detail-page request for each course card on the listing page.

     Every ``div.col-md-3`` element contributes a ``MultipageCourseItem``
     with the course name and cover image; the item is attached to the
     request as ``meta['item']`` so ``self.parse_author`` can complete it.

     A card whose ``h6.course-name`` text is missing gets ``name=None``
     instead of raising ``AttributeError`` and aborting the page. A card
     with no link is skipped because there is no detail page to request.
     """
     for course in response.css('div.col-md-3'):
         # The scraped link is relative; it is resolved with urljoin below.
         course_url = course.css('a::attr(href)').extract_first()
         if course_url is None:
             # urljoin(None) would only return the page's own base URL,
             # which is not a course detail page.
             continue
         # extract_first() returns None when nothing matches, so strip only
         # when a name was actually found.
         name_text = course.css('h6.course-name::text').extract_first()
         if name_text is not None:
             name_text = name_text.strip()
         item = MultipageCourseItem(
             # Course name
             name=name_text,
             # Course image
             image=course.css('img.cover-image::attr(src)').extract_first())
         full_course_url = response.urljoin(course_url)
         # Build the request for the course detail page and set its callback.
         request = scrapy.Request(full_course_url, self.parse_author)
         request.meta['item'] = item
         yield request
예제 #6
0
    def parse(self, response):
        """Yield a detail-page request for every ``a.course-box`` element.

        The course name and image are stored on a ``MultipageCourseItem``
        which is passed to ``self.parse_author`` as ``meta['item']``.
        """
        name_query = './/div[@class="course-name"]/text()'
        image_query = './/img/@src'
        for box in response.css('a.course-box'):
            item = MultipageCourseItem()
            item['name'] = box.xpath(name_query).extract_first()
            item['image'] = box.xpath(image_query).extract_first()

            # response.urljoin() builds the full URL from the scraped href.
            link = response.urljoin(box.xpath('@href').extract_first())
            # Yielding the request makes Scrapy fetch it and hand the
            # response to the callback; data placed in meta travels with
            # the request, which is how the item follows to the next page.
            yield scrapy.Request(link, callback=self.parse_author,
                                 meta={'item': item})
예제 #7
0
    def parse(self, response):
        """Scrape the entries on one listing page, then follow pagination.

        For each ``div.col-10`` element a ``MultipageCourseItem`` holding
        ``name`` and ``update_time`` is attached (as ``meta['item']``) to a
        request for the entry's first link, handled by ``self.parse_details``.
        Afterwards the ``BtnGroup`` pagination controls are inspected and the
        next listing page, if there is one, is scheduled with this method as
        its callback.

        Missing name/timestamp nodes produce empty lists instead of raising
        ``AttributeError``; entries without a link are skipped.
        """
        for course in response.css('div.col-10'):
            course_href = course.xpath('.//a/@href').extract_first()
            if course_href is None:
                # urljoin(None) would only return the page's own base URL.
                continue
            # default='' keeps .split() callable when nothing matches.
            # NOTE(review): split() stores a list of whitespace-separated
            # tokens, not a stripped string -- confirm downstream expects it.
            item = MultipageCourseItem(
                name=course.css('h3 a::text').extract_first(default='').split(),
                update_time=course.css(
                    'relative-time::attr(datetime)'
                ).extract_first(default='').split())
            full_course_url = response.urljoin(course_href)
            request = scrapy.Request(full_course_url,
                                     callback=self.parse_details)
            request.meta['item'] = item
            yield request

        # Work on extracted strings: comparing a Selector object with a str
        # is always False, which previously stopped the crawl on the first
        # page (where only "Previous" is disabled).
        disabled_labels = response.xpath(
            '//div[@class="BtnGroup"]/button[@disabled="disabled"]/text()'
        ).extract()
        page_links = response.xpath(
            '//div[@class="BtnGroup"]/a/@href').extract()

        if not disabled_labels:
            # No disabled button: the second link is used as the "next" page
            # (presumably the links are ordered previous, next). Guard the
            # index so a page without pagination links does not raise.
            if len(page_links) > 1:
                yield response.follow(page_links[1], callback=self.parse)
        elif disabled_labels[0].strip() == 'Previous':
            # "Previous" is disabled, so the only remaining link is followed.
            if page_links:
                yield response.follow(page_links[0], callback=self.parse)