def parse(self, response):
        # with open("teacher.html", 'wb') as f:
        #     f.write(response.body)  # dump the response body to a file for inspection

        # list collecting every teacher item
        teacherItem = []

        for each in response.xpath('//div[@class = "li_txt"]'):
            # wrap the scraped data in an `ItcastItem` object
            item = ItcastItem()

            # extract() converts the match to a unicode string;
            # without extract() you only get the xpath selector object
            name = each.xpath(
                './h3/text()').extract()  # xpath always returns a list; its elements depend on the match rule (e.g. text())
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            teacherItem.append(item)
            # return all the data at once so it can be saved in bulk
        return teacherItem
 def parse_detail(self, response):
     # requires: from bs4 import BeautifulSoup
     res = BeautifulSoup(response.body, 'html.parser')  # pass a parser explicitly
     itcastItem = ItcastItem()
     itcastItem['title'] = res.select('h1')[0].text
     itcastItem['content'] = res.select('p')[0].text
     itcastItem['time'] = res.select('.ndArticle_creat')[0].text
     return itcastItem
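All of the snippets on this page assume an ItcastItem class declared in the project's items.py. A minimal sketch with the field names these examples use (the parse_detail snippet above additionally needs content and time; Example #21 further down uses option and answer):

import scrapy

class ItcastItem(scrapy.Item):
    # field names taken from the snippets; a real project may define more
    name = scrapy.Field()     # teacher name
    title = scrapy.Field()    # teacher rank/title
    info = scrapy.Field()     # teacher bio
    content = scrapy.Field()  # used only by parse_detail above
    time = scrapy.Field()     # used only by parse_detail above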
Example #3
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        # list for storing the teacher info
        # items = []
        for node in node_list:
            # create an Item object to hold the data
            item = ItcastItem()

            # extract() converts the objects returned by node.xpath() into unicode strings
            names = node.xpath("./h3/text()").extract()
            titles = node.xpath("./h4/text()").extract()
            infos = node.xpath("./p/text()").extract()
            # xpath() returns a list, so access the value through index [0]

            # print("name:%s" % names)
            # print("title:%s" % titles)

            item['name'] = names[0]
            item['title'] = titles[0]
            item['info'] = infos[0]

            # items.append(item)

            # hand each item to the pipeline so data is processed as it arrives
            # instead of piling up in memory
            yield item
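The pipeline those comments refer to lives in a separate pipelines.py and must be enabled through ITEM_PIPELINES in settings.py. A minimal sketch of a JSON-lines pipeline that could receive these items (the class and file names are hypothetical, not from the original project):

import json

class ItcastPipeline:
    def open_spider(self, spider):
        # hypothetical output file
        self.f = open("teachers.jsonl", "w", encoding="utf-8")

    def process_item(self, item, spider):
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item  # pass the item on to any later pipeline

    def close_spider(self, spider):
        self.f.close()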
Example #4
 def parse(self, response):
     node_list = response.xpath("//div[@class='li_txt']")
     for node in node_list:
         item = ItcastItem()
         item['name'] = node.xpath("./h3/text()").extract()[0]
         item['title'] = node.xpath("./h4/text()").extract()[0]
         item['info'] = node.xpath("./p/text()").extract()[0]
         yield item
Example #5
 def parse(self, response):
     node_list = response.xpath("//div[@class='li_txt']")
     for node in node_list:
         item = ItcastItem()
         name = node.xpath("./h3/text()").extract_first()
         title = node.xpath("./h4/text()").extract_first()
         info = node.xpath("./p/text()").extract_first()
         print(name, title, info)
         item['name'] = name
         item['title'] = title
         item['info'] = info
         yield item
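The difference between Example #4 and Example #5 matters when an xpath matches nothing: extract()[0] raises IndexError, while extract_first() returns None or a supplied default. A small sketch using parsel, the selector library behind Scrapy:

from parsel import Selector

node = Selector(text="<div class='li_txt'><h3>Tom</h3></div>")
print(node.xpath("//h3/text()").extract())                     # ['Tom']
print(node.xpath("//h4/text()").extract_first())               # None (no <h4>)
print(node.xpath("//h4/text()").extract_first(default="N/A"))  # 'N/A'
# node.xpath("//h4/text()").extract()[0] would raise IndexError here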
Example #6
File: itcast.py  Project: andy0andy/crawl
    def parse(self, response):

        li_txt = response.xpath("//div[@class='li_txt']")

        for node in li_txt:
            # create the item that will hold the data
            item = ItcastItem()

            item["name"] = node.xpath("./h3/text()").get()
            item["title"] = node.xpath("./h4/text()").get()
            item["info"] = node.xpath("./p/text()").get()

            # hand the data to the pipelines
            yield item
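Example #6 uses .get(), the newer parsel/Scrapy spelling: .get() behaves like .extract_first() (including the default argument) and .getall() like .extract(). Continuing the parsel sketch above:

node.xpath("//h3/text()").get()            # 'Tom'  (same as extract_first())
node.xpath("//h3/text()").getall()         # ['Tom'] (same as extract())
node.xpath("//h4/text()").get(default="")  # ''     instead of None on no match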
Example #7
File: itcast.py  Project: bhcqzf/baiming
 def parse(self, response):
     # print(response.body.decode('utf-8'))
     node_list = response.xpath('//*[@class ="main_mask"]')
     # items=[]
     for node in node_list:
         # create an item object
         item = ItcastItem()
         name = node.xpath('./h2/text()').extract()
         title = node.xpath('./h2/span/text()').extract()
         info = node.xpath('./p/text()').extract()
         item['name'] = name[0]
         item['title'] = title[0]
         item['info'] = info[0].strip()
         yield item
Example #8
    def parse(self, response):
        #pass
        #print(response.body)
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            item = ItcastItem()
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = str(name[0])
            item['title'] = str(title[0])
            item['info'] = str(info[0])
            yield item
Example #9
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        #items = []
        for node in node_list:
            item = ItcastItem()

            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            #items.append(item)
            yield item
Example #10
    def parse(self, response):
        # items = []
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            item = ItcastItem()

            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # yield each extracted item to the pipeline; control then returns here and the loop continues
            yield item
Example #11
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            # create the item object that will hold the data
            item = ItcastItem()

            # .extract() converts the xpath selectors to Unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # yield each item to the pipeline; execution then comes back and continues the loop
            yield item
Example #12
 def parse(self, response):
     node_list = response.xpath('//div[@class="li_txt"]')
     # items = []
     for node in node_list:
         item = ItcastItem()
         item['name'] = node.xpath(
             './h3/text()').extract_first()  # .extract_first() turns the xpath match into text
         item['title'] = node.xpath('./h4/text()').extract_first()
         item['info'] = node.xpath('./p/text()').extract_first()
         # return the item to the pipeline:
         # return item
         # or hand a new URL back to the engine to keep crawling:
         # return scrapy.Request(url)
         # items.append(item)
         # yield the data to the pipeline; execution resumes here afterwards
         yield item
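As the commented-out return scrapy.Request(url) hints, parse() may yield a mixture of items and follow-up Requests; Scrapy routes items to the pipelines and schedules the requests. A hedged sketch, where the next-page selector is hypothetical rather than taken from this site:

import scrapy

def parse(self, response):
    for node in response.xpath('//div[@class="li_txt"]'):
        item = ItcastItem()
        item['name'] = node.xpath('./h3/text()').extract_first()
        yield item
    # hypothetical pagination link
    next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
    if next_page:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)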
Example #13
    def parse(self, response):
        # print(response)
        node_list = response.xpath('//div[@class="li_txt"]')
        for node in node_list:
            item = ItcastItem()
            # extract() converts the xpath selectors to Unicode strings
            name = node.xpath('./h3/text()').extract()
            title = node.xpath('./h4/text()').extract()
            info = node.xpath('./p/text()').extract()

            # xpath returns a list; take index [0] to get the string
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            yield item
Example #14
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")
        # a list could collect every item here, but the pipeline handles storage
        for node in node_list:
            # create the item object that will hold the data
            item = ItcastItem()
            # .extract() converts the xpath selectors to unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # yield each item to the pipeline; execution then returns to the for loop
            yield item
        print(response.body)
Example #15
    def parse(self, response):
        # print the response source
        # print response.body
        node_list = response.xpath("//div[@class='li_txt']")
        # used to store all the item fields
        for node in node_list:
            # create the item object that will hold the data
            item = ItcastItem()
            # .extract() converts the xpath selectors to unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # hand each item to the pipeline, one at a time, as it is built
            yield item
Example #16
    def parse(self, response):
        # node_list = response.xpath("//div[@class='li_txt']")  # xpath version
        node_list = response.css(".li_txt")
        for node in node_list:
            item = ItcastItem()
            # xpath version
            # item['name'] = node.xpath('./h3/text()').extract()[0]
            # item['title'] = node.xpath('./h4/text()').extract()[0]
            # item['info'] = node.xpath('./p/text()').extract()[0]

            # css selector version
            item['name'] = node.css('h3::text').extract()[0]
            item['title'] = node.css('h4::text').extract()[0]
            item['info'] = node.css('p::text').extract()[0]

            print(item['name'])
            print(item['title'])
            print(item['info'])
            yield item
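Scrapy compiles CSS selectors to XPath internally, with ::text mapping to /text(), so the two commented variants above select exactly the same strings. A quick check with parsel:

from parsel import Selector

sel = Selector(text="<div class='li_txt'><h3>Tom</h3><h4>Lecturer</h4></div>")
assert sel.css("h3::text").getall() == sel.xpath("//h3/text()").getall()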
Example #17
    def parse(self, response):
        # node list
        node_list = response.xpath("//div[@class='li_txt']")

        # used to store all the item fields
        items = []
        for node in node_list:
            # create the item object that will hold the data
            item = ItcastItem()
            # .extract() converts the xpath selectors to Unicode strings
            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item["name"] = name[0]
            item["title"] = title[0]
            item["info"] = info[0]
            items.append(item)

        return items
Example #18
    def parse(self, response):
        print('djh:ItcastSpider...parse()')
        node_list = response.xpath("//div[@class='li_txt']")

        items = []
        for node in node_list[:5]:
            # create the item object
            item = ItcastItem()

            # teacher name
            item['name'] = node.xpath("./h3/text()").extract()[0]  # selector list -> str
            # teacher title
            item['title'] = node.xpath("./h4/text()").extract()[0]  # selector list -> str
            # teacher info
            item['info'] = node.xpath("./p/text()").extract()[0]  # selector list -> str

            items.append(item)

        return items
Example #19
    def parse(self, response):
        # print (response.body)
        node_list = response.xpath("//div[@class='main_mask']")

        # Store all the item
        # items = []

        for node in node_list:
            # Create item for storing data
            item = ItcastItem()
            # .extract() converts the xpath selectors to Unicode strings
            name = node.xpath("./h2/text()").extract()
            title = node.xpath("./h2/span/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # items.append(item)

            # yield each item to the pipelines;
            # execution then returns here and the loop continues
            yield item
Example #20
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")  # node list
        # items = []  # a list could store every item, but the pipeline makes it
        # unnecessary; it also must not be created inside the for loop, or only
        # the last item would survive
        for node in node_list:
            # create the item object inside the loop so each node gets its own item
            item = ItcastItem()
            # node.xpath() returns selector objects; .extract() converts them to
            # unicode strings (i.e. extracts the field as text)
            name = node.xpath("./h3/text()").extract()  # same as item['name'] = node.xpath("./h3/text()").extract()[0], just split in two steps
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            # xpath returns a one-element list
            # print name[0]
            # print title[0]
            # print info[0]
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # items.append(item)  # would collect the item into the list
            # yield each item to the pipeline; on the next loop iteration execution
            # resumes here, whereas return would end the function for good
            yield item
        # or return everything at the end:
        # return items
Example #21
    def parse(self, response):
        item_list = []

        node_list = response.xpath(
            "//div[@id='shiti-content']/div[@class='shiti']")
        # print("hello world!")
        # print(response.xpath("//div[@class='shiti-content']/div[@class='shiti']"))
        answer_list = response.xpath(
            "//div[@class='shiti-content']/span/text()").extract()
        # print(len(node_list))
        # print(len(answer_list))
        for i in range(len(node_list)):
            item = ItcastItem()
            option = {}
            # .extract() converts the xpath selectors to Unicode strings
            name = node_list[i].xpath("./h3/text()").extract()[0]
            option_list = node_list[i].xpath("./ul/li/label/text()").extract()
            # print(type(option_list))
            for op in range(len(option_list)):
                option[op] = option_list[op]
            # print(type(option))
            answer = answer_list[i]
            # print(name)
            item["name"] = name
            # print(option)
            item["option"] = option
            # print(answer)
            item["answer"] = answer
            # return item
            # item_list.append(item)
            # or hand a URL back to the scheduler:
            # return scrapy.Request(url)
            # yield each item to the pipelines instead of collecting everything in
            # item_list, which would keep all the data in memory; after the pipeline
            # processes one item, execution resumes here
            yield item
        # a returned value would go to the engine instead:
        # return item_list
        # single-bank crawl:
        # if self.offset < 76:
        #      self.offset += 1
        #      url = self.baseURL + str(self.offset)
        #      print(url)
        #      yield scrapy.Request(url, callback=self.parse)

        # URL concatenation: multi-bank crawl
        # when the first bank has been fully crawled, move on to the next one
        print(self.examLib[self.exam_list[self.KuNumber]][1])
        print(len(self.exam_list))
        print(self.KuNumber)
        if (self.offset < self.examLib[self.exam_list[self.KuNumber]][1]
                and self.KuNumber < len(self.exam_list)):
            self.offset += 1
            url = self.baseURL + 'tikubh=' + str(self.examLib[self.exam_list[
                self.KuNumber]][0]) + '&page=' + str(self.offset)
            print(url)
            yield scrapy.Request(url, callback=self.parse)
        elif self.KuNumber < len(self.exam_list) - 1:
            self.offset = 1
            self.KuNumber += 1
            url = self.baseURL + 'tikubh=' + str(self.examLib[self.exam_list[
                self.KuNumber]][0]) + '&page=' + str(self.offset)
            print(url)
            yield scrapy.Request(url, callback=self.parse)
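For readability, the pagination above assumes roughly this state on the spider; the attribute names come from the snippet, but every value here is invented for illustration:

# Hypothetical spider attributes assumed by Example #21:
baseURL = "http://example.com/shiti?"  # hypothetical endpoint
exam_list = ["math", "english"]        # question-bank keys
examLib = {
    "math": (101, 76),     # (tikubh id, page count), inferred from the URL building
    "english": (102, 40),
}
offset = 1    # current page within the current bank
KuNumber = 0  # index of the current bank in exam_list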