Example #1
	def parse(self, response):
	#	with open("teacher.html", "w") as f:
	#		f.write(response.body)

		# Use Scrapy's built-in XPath to select the root node of every teacher
		teacher_list = response.xpath('//div[@class="li_txt"]')
		# List collecting every teacher's item
		teacherItems = []
		# Iterate over the root nodes
		for each in teacher_list:
			# Instantiate an item to hold the data
			item = ItcastItem()
			# extract() converts the matched results into Unicode strings;
			# without extract() the result is a list of XPath selector objects
			name = each.xpath('./h3/text()').extract()
			title = each.xpath('./h4/text()').extract()
			info = each.xpath('./p/text()').extract()

			# print(name[0])
			# print(title[0])
			# print(info[0])

			# Store the data
			item['name'] = name[0]
			item['title'] = title[0]
			item['info'] = info[0]
			teacherItems.append(item)
		return teacherItems
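All of these parse() methods assume an ItcastItem defined in the project's items.py. A minimal sketch of what that definition might look like (the exact field names vary between examples, e.g. title vs. level):

    import scrapy

    class ItcastItem(scrapy.Item):
        # Fields assumed by most of the examples below
        name = scrapy.Field()   # teacher's name  (<h3>)
        title = scrapy.Field()  # teacher's title (<h4>)
        info = scrapy.Field()   # teacher's bio   (<p>)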

			
Example #2
    def parse(self, response):
        # open("teacher.html", "wb").write(response.body)

        # Collection holding the teachers' info
        # items = []

        for each in response.xpath("//div[@class='li_txt']"):
            # Wrap the extracted data in an `ItcastItem` object
            item = ItcastItem()
            # extract() always returns Unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath() returns a single-element list here
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # items.append(item)

            # Hand the scraped data to the pipelines
            yield item

        # Alternative: collect the items and return the list at the end
        # return items
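Several of these examples mention handing the yielded items to the pipelines. A minimal sketch of a pipeline that could receive them (a Python 3 sketch; the class name and output file are illustrative, and it would also need to be enabled under ITEM_PIPELINES in settings.py):

    import json

    class ItcastJsonPipeline(object):
        def open_spider(self, spider):
            self.f = open("teachers.json", "w", encoding="utf-8")

        def process_item(self, item, spider):
            # ensure_ascii=False keeps the Chinese text readable in the file
            self.f.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
            return item

        def close_spider(self, spider):
            self.f.close()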
Example #3
    def parse(self, response):
        # filename = 'teacher.html'
        # open(filename, 'w').write(response.body)

        items = []

        for each in response.xpath('//div[@class="li_txt"]'):
            # Wrap the extracted data in an ItcastItem object
            item = ItcastItem()

            # extract() always returns Unicode objects
            name = each.xpath('h3/text()').extract()
            title = each.xpath('h4/text()').extract()
            info = each.xpath('p/text()').extract()

            # xpath() returns a single-element list here.
            # Note: encoding to UTF-8 bytes at this point can make
            # json.dumps(dict(item), ensure_ascii=False) fail in the pipeline
            item['name'] = name[0].encode('utf-8')
            item['title'] = title[0].encode('utf-8')
            item['info'] = info[0].encode('utf-8')
            '''
            # The lines below raise an error, which shows that a scrapy
            # Item is a dict-like object, not attribute-based:
            item.name = name[0]
            item.title = title[0]
            item.info = info[0]
            items.append(item)
            '''
            # return items
            yield item
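The triple-quoted block above is worth spelling out: scrapy.Item only supports dict-style access, so attribute assignment fails. A quick sketch:

    item = ItcastItem()
    item['name'] = u'test'    # OK: dict-style access is supported
    print(item['name'])
    try:
        item.name = u'test'   # raises AttributeError: use item['name']
    except AttributeError as e:
        print(e)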
Example #4
    def parse(self, response):
        # with open('teacher2.html', 'w') as f:
        #     f.write(response.body)  # note: it is response.body, not read()

        # Collection of all teachers
        # teacherItem = []
        # Use Scrapy's built-in XPath to select every teacher's root node
        # and iterate over them
        for each in response.xpath("//div[@class='li_txt']"):

            # The item object holds the data
            item = ItcastItem()
            # Without .extract() the result is a list of XPath selector objects;
            # extract() converts the matches into Unicode strings
            # name
            name = each.xpath("./h3/text()").extract()
            # title
            title = each.xpath("./h4/text()").extract()
            # info
            info = each.xpath("./p/text()").extract()

            # print(name[0])
            # print(title[0])
            # print(info[0])

            # item['name'] = name[0].encode('gbk')
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # teacherItem.append(item)

            yield item
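The comment above points at a common stumbling block in these examples: response.body is raw bytes, response.text is the decoded string, and the file mode has to match. A quick sketch of both:

    # response.body is bytes -> binary mode
    with open("teacher.html", "wb") as f:
        f.write(response.body)

    # response.text is the decoded str -> text mode (Python 3)
    with open("teacher.html", "w", encoding="utf-8") as f:
        f.write(response.text)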
Example #5
    def parse(self, response):
        # open("teacher.html", "wb").write(response.body)
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
        data = response.body.decode()
        position_pat = re.compile('html">.*?</a>.*\n.*')
        matches = position_pat.finditer(data)

        for i in matches:
            # m = re.search('(?<=.html">)', i)
            m = i.group()
            m = m.split(">")
            M = m[1]
            M = M[:-3]      # strip the trailing "</a"
            n = m[-1]
            n = n[15:]
            item = ItcastItem()
            item['name'] = M
            item['remise'] = n
            yield item

        print("Page {0} finished".format(self.offset))
        if self.offset < 1:  # crawl only the first few pages
            self.offset += 1
            url2 = ("https://www.55haitao.com/store/list/0-8-0-0-all-" +
                    str(self.offset) + ".html")
            yield scrapy.Request(url=url2, callback=self.parse)
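This example assumes the spider defines a page counter and that the re module is imported at module level; a hypothetical skeleton of the surrounding spider:

    import re
    import scrapy

    class HaitaoSpider(scrapy.Spider):
        name = "haitao"     # hypothetical spider name
        offset = 1          # page counter used by parse()
        start_urls = [
            "https://www.55haitao.com/store/list/0-8-0-0-all-1.html",
        ]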
Example #6
    def parse(self, response):
        # with open("teacher.html", "w") as f:
        #     f.write(response.body)
        # Use Scrapy's built-in XPath to select every teacher's root node
        teacher_list = response.xpath('//div[@class="li_txt"]')
        # List of all teachers' items (unused; the items are yielded instead)
        teacherItem = []

        # Iterate over the root nodes
        for each in teacher_list:

            # The Item object holds the data
            item = ItcastItem()
            # name; extract() converts the matches into Unicode strings,
            # without extract() the result is XPath selector objects
            name = each.xpath('./h3/text()').extract()
            # level
            level = each.xpath('./h4/text()').extract()
            # info
            info = each.xpath('./p/text()').extract()
            # print(name[0], level[0], info[0])
            item["name"] = name[0]
            item["level"] = level[0]
            item["info"] = info[0]

            # Pass the result to the pipelines
            yield item
Example #7
    def parse(self, response):
        # filename = "teacher.html"
        # open(filename, 'w').write(response.body)

        # Container for the data (unused; items are yielded instead)
        items = []

        for each in response.xpath("//div[@class='li_txt']"):
            # Wrap the extracted data in an `ItcastItem` object
            item = ItcastItem()
            # extract() always returns Unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath() returns a single-element list here
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # items.append(item)
            yield item
Example #8
	def parse(self, response):
		# with open('teacher.html', 'w') as f:
		# 	f.write(response.body)
		teacher_list = response.xpath("//div[@class='li_txt']")

		# teacherItem = []
		for each in teacher_list:
			# Wrap the extracted data in an `ItcastItem` object
			item = ItcastItem()
			# extract() converts the matches into Unicode strings;
			# without extract() the result is XPath selector objects
			name = each.xpath("./h3/text()").extract()
			# title
			title = each.xpath("./h4/text()").extract()
			# info
			info = each.xpath("./p/text()").extract()

			item['name'] = name[0].encode('utf-8')
			item['title'] = title[0].encode('utf-8')
			item['info'] = info[0].encode('utf-8')

			# teacherItem.append(item)

			# Hand the scraped data to the pipelines
			yield item
Example #9
    def parse(self, response):
        # Save the page source
        # with open("teacher.html", "w") as f:
        #     f.write(response.body)
        # Use Scrapy's built-in XPath to select the nodes
        teacher_list = response.xpath('//div[@class="li_txt"]')
        # teacherItem = []
        # Iterate over the node set
        for each in teacher_list:
            # Instantiate an object to hold the data
            item = ItcastItem()
            # name; extract() converts to Unicode strings
            name = each.xpath('./h3/text()').extract()
            # title
            title = each.xpath('./h4/text()').extract()
            # info
            info = each.xpath('./p/text()').extract()

            # item['name'] = name[0].encode('gbk')
            # item['title'] = title[0].encode('gbk')
            # item['info'] = info[0].encode('gbk')
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            yield item
Example #10
    def parse(self, response):
        # with open("teacher.html", "w") as f:
        #     f.write(response.body)
        # Use Scrapy's built-in XPath to select every teacher's root node
        teacher_list = response.xpath('//div[@class="li_txt"]')

        # List of all teachers' items
        # teacherItem = []
        # Iterate over the root nodes
        for each in teacher_list:
            # The Item object holds the data
            item = ItcastItem()
            # name; extract() converts the matches into Unicode strings,
            # without extract() the result is XPath selector objects
            name = each.xpath('./h3/text()').extract()
            # title
            title = each.xpath('./h4/text()').extract()
            # info
            info = each.xpath('./p/text()').extract()

            # item['name'] = name[0].encode("gbk")
            # item['title'] = title[0].encode("gbk")
            # item['info'] = info[0].encode("gbk")

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            yield item
Example #11
    def parse(self, response):

        # Use Scrapy's built-in XPath to select every row node
        for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):

            # The Item object holds the data
            item = ItcastItem()

            # position name
            item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
            # detail link
            item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
            # position category
            item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
            # number of openings
            item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
            # work location
            item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
            # publish date
            item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

            yield item

        # After the rows on this page are done, advance the offset by 10
        # and request the next page, until the offset reaches 2438
        if self.offset < 2438:
            self.offset += 10
            yield scrapy.Request(self.url + str(self.offset),
                                 callback=self.parse)
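Unlike the teacher examples, this one fills job-posting fields, so it assumes an Item defined with those fields; a hypothetical sketch:

    import scrapy

    class ItcastItem(scrapy.Item):
        positionname = scrapy.Field()   # position name
        positionlink = scrapy.Field()   # detail-page link
        positionType = scrapy.Field()   # position category
        peopleNum = scrapy.Field()      # number of openings
        workLocation = scrapy.Field()   # work location
        publishTime = scrapy.Field()    # publish date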
Example #12
    def parse(self, response):
        # Use Scrapy's built-in XPath to select every teacher's root node
        teacher_list = response.xpath('//div[@class="li_txt"]')

        # List of all teachers' items
        # teacherItem = []
        # Iterate over the root nodes
        for each in teacher_list:

            # The Item object holds the data
            item = ItcastItem()
            # Without extract() the result is XPath selector objects
            name = each.xpath('./h3/text()').extract()
            # title
            title = each.xpath('./h4/text()').extract()
            # info
            info = each.xpath('./p/text()').extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # Use yield to hand the scraped data to the pipelines
            yield item
Example #13
    def parse(self, response):
        # with open('teacher.html', 'w') as f:
        #     f.write(response.body)
        teacher_list = response.xpath('//div[@class="li_txt"]')
        items = []
        # Iterate over the root nodes
        for each in teacher_list:
            # Instantiate an item object to hold the data
            item = ItcastItem()
            # Convert the matched content (selector objects) to Unicode strings
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()

            # print(name[0])
            # print(title[0])
            # print(info[0])
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            # items.append(item)

            # Hand the result to the pipeline
            yield item
Example #14
    def parse(self, response):
        # with open('teacher.html', 'w') as f:
        #     f.write(response.body)
        # Use Scrapy's built-in XPath to select every teacher's root node
        teacher_list = response.xpath('//div[@class="li_txt"]')

        # List of all teachers' items
        # teacherItem = []
        # Iterate over the root nodes
        for each in teacher_list:

            # The item object holds the data
            item = ItcastItem()
            # extract() converts the matches into Unicode strings;
            # without extract() the result is XPath selector objects
            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            yield item

            # teacherItem.append(item)

            # print(name[0])
            # print(title[0])
            # print(info[0])
        # return teacherItem
Example #15
    def parse(self, response):
        # open("teacher.html", "wb").write(response.body)

        # Collection holding the teachers' info
        items = []

        # Extract the data with XPath
        for each in response.xpath(
                "//div[@class='li_txt']"
        ):  # /html/body/div[1]/div[5]/div[2]/div[4]/ul/li[1]/div[2]
            # Wrap the extracted data in an `ItcastItem` object
            item = ItcastItem()
            # extract() always returns Unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath() returns a single-element list here
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            items.append(item)

        # Return the full list at the end
        return items
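Whether a parse() method yields items one by one or returns them as a list, Scrapy feeds the results to the enabled pipelines and exporters the same way, so either variant can be exported straight from the command line (assuming the spider is named itcast, as in these examples): scrapy crawl itcast -o teachers.json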
Example #16
    def parse(self, response):

        # Collection of all teachers
        items = []

        # file_name = "teacher.html"
        # open(file_name, "w").write(response.body)
        for site in response.xpath('//div[@class="li_txt"]'):

            # Data for one teacher
            item = ItcastItem()

            teacher_name = site.xpath('h3/text()').extract()
            teacher_level = site.xpath('h4/text()').extract()
            teacher_info = site.xpath('p/text()').extract()

            print(teacher_name[0])
            print(teacher_level[0])
            print(teacher_info[0])
            print("=====================")

            item['name'] = teacher_name[0]
            item['level'] = teacher_level[0]
            item['info'] = teacher_info[0]

            items.append(item)

        return items
Example #17
    def parse2(self, response):
        # filename = "bdnews.html"
        # open(filename, 'wb').write(response.body)
        print("{1}.***.{0}".format(datetime.datetime.now(), self.count))
        # print(response.body)
        item = ItcastItem()
        item['name'] = 'a'
        item['title'] = 'b'
        item['info'] = 'c'
        yield item
Example #18
    def parse(self, response):
        # filename = "teacher.html"
        # open(filename, 'w').write(response.body.decode('utf-8'))
        items = []
        for each in response.xpath("//div[@class='li_txt']"):
            item = ItcastItem()
            name = each.xpath("h3/text()").extract()
            item['name'] = name[0]
            items.append(item)
        # return the collected items as a list
        return items
Example #19
    def parse(self, response):
        # filename = 'teacher.html'
        # open(filename, 'w').write(response.body)
        items = []
        for each in response.xpath("//div[@class='li_txt']"):
            item = ItcastItem()
            name = each.xpath('h3/text()').extract()[0]
            level = each.xpath('h4/text()').extract()[0]
            info = each.xpath('p/text()').extract()[0]
            # store the extracted values on the item
            item['name'] = name
            item['level'] = level
            item['info'] = info
            items.append(item)
        return items
Example #20
    def parse(self, response):
        teacher_list = response.xpath('//div[@class="main_rpicR"]')
        for each in teacher_list:
            item = ItcastItem()
            name = each.xpath('./h3/a/text()').extract()
            info = each.xpath('.//p[position()=1]/text()').extract()
            time = each.xpath('.//p[position()=2]/text()').extract()
            item["name"] = name[0]
            item["info"] = info[0]
            item["time"] = time[0]
            yield item
Example #21
    def parse(self, response):
        node_list = response.xpath("//div[@class='li_txt']")

        for node in node_list:
            item = ItcastItem()

            name = node.xpath("./h3/text()").extract()
            title = node.xpath("./h4/text()").extract()
            info = node.xpath("./p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            yield item
Example #22
    def parse(self, response):
        # with open("teacher.html","w") as f:
        #     f.write(response.body)
        teacher_list = response.xpath('//div[@class="li_txt"]')
        # teacherItem = []
        for each in teacher_list:

            item = ItcastItem()

            name = each.xpath('./h3/text()').extract()
            title = each.xpath('./h4/text()').extract()
            info = each.xpath('./p/text()').extract()
            item["name"] = name[0]
            item["title"] = title[0]
            item["info"] = info[0]
            yield item
Example #23
	def parse(self, response):
		#filename = "teacher.html"
		#open(filename, "wb").write(response.body)
		items = []
		for each in response.xpath("//div[@class='li_txt']"):
			item = ItcastItem()
			name = each.xpath("h3/text()").extract()
			title = each.xpath("h4/text()").extract()
			info = each.xpath("p/text()").extract()

			item['name'] = name[0]
			item['title'] = title[0]
			item['info'] = info[0]

			items.append(item)
		return items
Example #24
    def parse(self, response):
        items = []
        for each in response.xpath("//div[@class='li_txt']"):
            item = ItcastItem()

            # extract() always returns Unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]

            #items.append(item)
            yield item
Example #25
    def parse(self, response):
        # Collection of teachers (unused; items are yielded instead)
        items = []
        li_all = response.xpath("//div[@class='li_txt']")
        for li_one in li_all:
            item = ItcastItem()

            # extract_first() returns the first match, or None if empty
            name = li_one.xpath("h3/text()").extract_first()
            level = li_one.xpath("h4/text()").extract_first()
            info = li_one.xpath("p/text()").extract_first()

            item["name"] = name
            item["level"] = level
            item["info"] = info
            items.append(item)
            # Hand the scraped data to the pipelines
            yield item
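This is the only example that uses extract_first() instead of extract()[0]; the difference matters when a node is missing. A quick sketch:

    sel = li_one.xpath("h3/text()")
    sel.extract()                     # list of all matches (may be empty)
    sel.extract()[0]                  # raises IndexError on an empty match
    sel.extract_first()               # first match, or None if empty
    sel.extract_first(default="n/a")  # first match, or a fallback value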
Example #26
    def parse(self, response):
        # Collection holding the teachers' info
        # items = []
        for each in response.xpath('//div[@class="li_txt"]'):
            # Wrap the extracted data in an `ItcastItem` object
            item = ItcastItem()
            # extract() always returns Unicode strings; indexing the
            # selector list first, then extracting, works the same way
            name = each.xpath('h3/text()')[0].extract()
            title = each.xpath('h4/text()')[0].extract()
            info = each.xpath('p/text()')[0].extract()
            item['name'] = name
            item['title'] = title
            item['info'] = info

            # items.append(item)
            yield item
Example #27
    def parse_content(self, response):
        items = []
        # Debugging: inspect the response before parsing
        # print(response.text)
        # print(response.xpath("//div[@class='li_txt']"))
        for i in response.xpath("//div[@class='li_txt']"):
            item = ItcastItem()

            name = i.xpath('h3/text()').extract()
            title = i.xpath('h4/text()').extract()
            info = i.xpath('p/text()').extract()

            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            yield item
Example #28
    def parse(self, response):
        #        filename = "teachers.html"
        #        with open(filename,'w') as f:
        #            f.write(response.body)
        #        print(response.body)

        items = []
        node_list = response.xpath("//div[@class='li_txt']")
        for node in node_list:
            item = ItcastItem()

            item['name'] = node.xpath("./h3/text()").extract()[0]
            item['level'] = node.xpath("./h4/text()").extract()[0]
            item['info'] = node.xpath("./p/text()").extract()[0]

            items.append(item)

        return items
Example #29
    def parse(self, response):
        # with open("teacher.html", "w") as f:
        #     f.write(response.body)
        # teacher_item = []
        name_list = response.xpath("//div[@class='li_txt']")
        for teacher in name_list:
            teacher_name = teacher.xpath("./h3/text()").extract()
            teacher_title = teacher.xpath("./h4/text()").extract()
            teacher_info = teacher.xpath("./p/text()").extract()
            # print(teacher_name[0] + teacher_title[0] + teacher_info[0])
            item = ItcastItem()
            item["name"] = teacher_name[0]
            item["title"] = teacher_title[0]
            item["info"] = teacher_info[0]

            # yield inside the loop so every teacher is emitted
            yield item
        # teacher_item.append(item)
        # return teacher_item
Example #30
    def parse(self, response):

        items = []

        for each in response.xpath("//div[@class='li_txt']"):
            # Wrap the extracted data in an `ItcastItem` object
            item = ItcastItem()
            # extract() always returns Unicode strings
            name = each.xpath("h3/text()").extract()
            title = each.xpath("h4/text()").extract()
            info = each.xpath("p/text()").extract()

            # xpath() returns a single-element list here
            item['name'] = name[0]
            item['title'] = title[0]
            item['info'] = info[0]
            # items.append(item)

            # Hand the scraped data to the pipeline
            yield item