def parse(self, response):
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    # Match the root node of every teacher with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')

    # Collection of all teacher items
    teacherItems = []

    # Iterate over the root nodes
    for each in teacher_list:
        # Instantiate an item to hold the data
        item = ItcastItem()
        # extract() converts the matched results into Unicode strings;
        # without extract() the result is a list of XPath selector objects
        name = each.xpath('./h3/text()').extract()
        title = each.xpath('./h4/text()').extract()
        info = each.xpath('./p/text()').extract()

        # print(name[0])
        # print(title[0])
        # print(info[0])

        # Store the data
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        teacherItems.append(item)

    return teacherItems

def parse(self, response):
    # open("teacher.html", "wb").write(response.body)

    # Collection of teacher items
    # items = []

    for each in response.xpath("//div[@class='li_txt']"):
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() returns a list of Unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        # xpath returns a list containing a single element
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # items.append(item)

        # Hand the item over to the pipelines
        yield item

    # Alternatively, collect the items and return them in one batch:
    # return items

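# A minimal sketch of the ItcastItem definition these parse() variants assume,
# based on the fields they assign (a hypothetical items.py); variants below
# that set 'level', 'time', 'remise', or the position fields would declare
# those fields instead.
import scrapy

class ItcastItem(scrapy.Item):
    name = scrapy.Field()   # teacher name from <h3>
    title = scrapy.Field()  # teacher title from <h4>
    info = scrapy.Field()   # teacher profile text from <p>
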
def parse(self, response):
    # filename = 'teacher.html'
    # open(filename, 'w').write(response.body)

    items = []
    for each in response.xpath('//div[@class="li_txt"]'):
        # Wrap the extracted data in an ItcastItem object
        item = ItcastItem()
        # extract() returns a list of unicode strings
        name = each.xpath('h3/text()').extract()
        title = each.xpath('h4/text()').extract()
        info = each.xpath('p/text()').extract()

        # xpath returns a list containing a single element.
        # If the values are encoded to UTF-8 here,
        # json.dumps(dict(item), ensure_ascii=False) raises an error
        # when the pipeline processes the item.
        item['name'] = name[0].encode('utf-8')
        item['title'] = title[0].encode('utf-8')
        item['info'] = info[0].encode('utf-8')

        '''
        # The following raises an error, which shows that scrapy.Item
        # is a dict-like object, not an object with plain attributes
        item.name = name[0]
        item.title = title[0]
        item.info = info[0]
        items.append(item)
        '''
        # return items
        yield item

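# A minimal sketch of the JSON pipeline the comment above refers to
# (a hypothetical ItcastPipeline in pipelines.py); json.dumps with
# ensure_ascii=False expects str fields, which is why the .encode('utf-8')
# calls above can trip it up.
import json

class ItcastPipeline(object):
    def open_spider(self, spider):
        self.f = open('teacher.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps Chinese text human-readable in the output
        self.f.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        self.f.close()
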
def parse(self, response):
    # with open('teacher2.html', 'w') as f:
    #     # Note: it is body, not read
    #     f.write(response.body)

    # Collection of all teacher items
    # teacherItem = []

    # Match the root node of every teacher with Scrapy's built-in XPath,
    # then iterate over the root nodes
    for each in response.xpath("//div[@class='li_txt']"):
        # The item object holds the data
        item = ItcastItem()
        # Without .extract() the result is a list of XPath selector objects;
        # extract() converts the matches into Unicode strings

        # name
        name = each.xpath("./h3/text()").extract()
        # title
        title = each.xpath("./h4/text()").extract()
        # info
        info = each.xpath("./p/text()").extract()

        # print(name[0])
        # print(title[0])
        # print(info[0])

        # item['name'] = name[0].encode('gbk')
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # teacherItem.append(item)
        yield item

def parse(self, response):
    # Assumes `import re` and `import scrapy` at module level,
    # plus an `offset` attribute on the spider.
    # open("teacher.html", "wb").write(response.body)
    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")
    items = []
    filename = "55.html"
    data = response.body.decode()

    position_pat = re.compile('html">.*?</a>.*\n.*')
    matches = position_pat.finditer(data)  # avoid shadowing the built-in `list`
    for i in matches:
        # m = re.search('(?<=.html">)', i)
        m = i.group()
        m = m.split(">")
        M = m[1]
        M = M[:-3]
        n = m[-1]
        n = n[15:]
        item = ItcastItem()
        item['name'] = M
        item['remise'] = n
        items.append(item)
        yield item

    print("Finished crawling page {0}".format(self.offset))
    if self.offset < 1:  # how many extra pages to crawl
        self.offset += 1
        url2 = ("https://www.55haitao.com/store/list/0-8-0-0-all-"
                + str(self.offset) + ".html")
        yield scrapy.Request(url=url2, callback=self.parse)

def parse(self, response):
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    # Match the root node of every teacher with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')

    # Collection of all teacher items
    teacherItem = []

    # Iterate over the root nodes
    for each in teacher_list:
        # The Item object holds the data
        item = ItcastItem()
        # name; extract() converts the matches into Unicode strings,
        # without extract() the result is an XPath selector object
        name = each.xpath('./h3/text()').extract()
        # level
        level = each.xpath('./h4/text()').extract()
        # info
        info = each.xpath('./p/text()').extract()

        # print(name[0], level[0], info[0])

        item["name"] = name[0]
        item["level"] = level[0]
        item["info"] = info[0]

        # Hand the result over to the pipelines
        yield item

def parse(self, response):
    # pass
    filename = "teacher.html"
    # open(filename, 'wb').write(response.body)

    # Holds the collected data
    items = []

    for each in response.xpath("//div[@class='li_txt']"):
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() returns a list of unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        # xpath returns a list containing a single element
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # items.append(item)
        yield item

def parse(self, response):
    # with open('teacher.html', 'w') as f:
    #     f.write(response.body)

    teacher_list = response.xpath("//div[@class='li_txt']")
    # teacherItem = []

    for each in teacher_list:
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() converts the matched results into Unicode strings;
        # without extract() the result is an XPath selector object
        name = each.xpath("./h3/text()").extract()
        # title
        title = each.xpath("./h4/text()").extract()
        # info
        info = each.xpath("./p/text()").extract()

        item['name'] = name[0].encode('utf-8')
        item['title'] = title[0].encode('utf-8')
        item['info'] = info[0].encode('utf-8')

        # teacherItem.append(item)
        # Hand the item over to the pipelines
        yield item

def parse(self, response):
    # Save the page source
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    # Grab the teacher nodes with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')
    # teacherItem = []

    # Iterate over the node set
    for each in teacher_list:
        # Instantiate an object to hold the data
        item = ItcastItem()
        # name; extract() converts to Unicode strings
        name = each.xpath('./h3/text()').extract()
        # title
        title = each.xpath('./h4/text()').extract()
        # info
        info = each.xpath('./p/text()').extract()

        # item['name'] = name[0].encode('gbk')
        # item['title'] = title[0].encode('gbk')
        # item['info'] = info[0].encode('gbk')
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        yield item

def parse(self, response):
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    # Match the root node of every teacher with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')

    # Collection of all teacher items
    # teacherItem = []

    # Iterate over the root nodes
    for each in teacher_list:
        # The Item object holds the data
        item = ItcastItem()
        # name; extract() converts the matched results into Unicode strings,
        # without extract() the result is an XPath selector object
        name = each.xpath('./h3/text()').extract()
        # title
        title = each.xpath('./h4/text()').extract()
        # info
        info = each.xpath('./p/text()').extract()

        # item['name'] = name[0].encode("gbk")
        # item['title'] = title[0].encode("gbk")
        # item['info'] = info[0].encode("gbk")
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        yield item

def parse(self, response):
    # Match the root node of every position row with Scrapy's built-in XPath
    for each in response.xpath("//tr[@class='even'] | //tr[@class='odd']"):
        # The Item object holds the data
        item = ItcastItem()
        # Position name
        item['positionname'] = each.xpath("./td[1]/a/text()").extract()[0]
        # Detail link
        item['positionlink'] = each.xpath("./td[1]/a/@href").extract()[0]
        # Position category
        item['positionType'] = each.xpath("./td[2]/text()").extract()[0]
        # Number of openings
        item['peopleNum'] = each.xpath("./td[3]/text()").extract()[0]
        # Work location
        item['workLocation'] = each.xpath("./td[4]/text()").extract()[0]
        # Publish date
        item['publishTime'] = each.xpath("./td[5]/text()").extract()[0]

        yield item

    # While fewer than 2438 records have been crawled, advance by 10 per page
    if self.offset < 2438:
        self.offset += 10
        # Re-crawl with the new offset appended to the base URL
        yield scrapy.Request(self.url + str(self.offset), callback=self.parse)

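# The pagination above relies on spider attributes along these lines;
# the class name and base URL here are hypothetical stand-ins, and the
# item class would need the position fields assigned above.
import scrapy

class PositionSpider(scrapy.Spider):
    name = "positions"
    offset = 0
    url = "https://example.com/position.php?start="  # hypothetical base URL
    start_urls = [url + str(offset)]
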
def parse(self, response):
    # Match the root node of every teacher with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')

    # Collection of all teacher items
    # teacherItem = []

    # Iterate over the root nodes
    for each in teacher_list:
        # The Item object holds the data
        item = ItcastItem()
        # Without extract() the result is an XPath selector object
        name = each.xpath('./h3/text()').extract()
        # title
        title = each.xpath('./h4/text()').extract()
        # info
        info = each.xpath('./p/text()').extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # Use yield to hand the data over to the pipelines
        yield item

def parse(self, response):
    # with open('teacher.html', 'w') as f:
    #     f.write(response.body)

    teacher_list = response.xpath('//div[@class="li_txt"]')
    items = []

    # Iterate over the root nodes
    for each in teacher_list:
        # Instantiate an item object to hold the data
        item = ItcastItem()
        # Convert the matched content (selector objects) into Unicode strings
        name = each.xpath('./h3/text()').extract()
        title = each.xpath('./h4/text()').extract()
        info = each.xpath('./p/text()').extract()

        # print(name[0])
        # print(title[0])
        # print(info[0])

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # items.append(item)
        # Hand the values over to the pipeline
        yield item

def parse(self, response):
    # with open('teacher.html', 'w') as f:
    #     f.write(response.body)

    # Match the root node of every teacher with Scrapy's built-in XPath
    teacher_list = response.xpath('//div[@class="li_txt"]')

    # Collection of all teacher items
    # teacherItem = []

    # Iterate over the root nodes
    for each in teacher_list:
        # The item object holds the data
        item = ItcastItem()
        # extract() converts the matched results into unicode strings;
        # without extract() the result is an XPath selector object
        name = each.xpath('./h3/text()').extract()
        title = each.xpath('./h4/text()').extract()
        info = each.xpath('./p/text()').extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        yield item
        # teacherItem.append(item)
        # print(name[0])
        # print(title[0])
        # print(info[0])

    # return teacherItem

def parse(self, response):
    # open("teacher.html", "wb").write(response.body)

    # Collection of teacher items
    items = []

    # Extract the data with XPath; the full path to one node is
    # /html/body/div[1]/div[5]/div[2]/div[4]/ul/li[1]/div[2]
    for each in response.xpath("//div[@class='li_txt']"):
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() returns a list of unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        # xpath returns a list containing a single element
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        items.append(item)

    # Return all the collected data at once
    return items

def parse(self, response):
    # Collection of all teachers
    items = []
    # file_name = "teacher.html"
    # open(file_name, "w").write(response.body)

    for site in response.xpath('//div[@class="li_txt"]'):
        # Data for a single teacher
        item = ItcastItem()
        teacher_name = site.xpath('h3/text()').extract()
        teacher_level = site.xpath('h4/text()').extract()
        teacher_info = site.xpath('p/text()').extract()

        print(teacher_name[0])
        print(teacher_level[0])
        print(teacher_info[0])
        print("=====================")

        item['name'] = teacher_name[0]
        item['level'] = teacher_level[0]
        item['info'] = teacher_info[0]
        items.append(item)

    return items

def parse2(self, response):
    # filename = "bdnews.html"
    # open(filename, 'wb').write(response.body)

    print("{1}.***.{0}".format(datetime.datetime.now(), self.count))
    # print(response.body)

    item = ItcastItem()
    item['name'] = 'a'
    item['title'] = 'b'
    item['info'] = 'c'
    yield item

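# parse2 assumes a module-level `import datetime` and a `count` attribute
# on the spider; a hypothetical scaffold for those assumptions:
import datetime
import scrapy

class BdnewsSpider(scrapy.Spider):  # hypothetical spider name
    name = "bdnews"
    count = 0
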
def parse(self, response):
    # filename = "teacher.html"
    # open(filename, 'w').write(response.body.decode('utf-8'))

    items = []
    for each in response.xpath("//div[@class='li_txt']"):
        item = ItcastItem()
        name = each.xpath("h3/text()").extract()
        item['name'] = name[0]
        items.append(item)

    # Return the collected items; yielding the list object itself would
    # fail, because Scrapy expects individual items or requests
    return items

def parse(self, response):
    # filename = 'teacher.html'
    # open(filename, 'w').write(response.body)

    items = []
    for each in response.xpath("//div[@class='li_txt']"):
        item = ItcastItem()
        name = each.xpath('h3/text()').extract()[0]
        level = each.xpath('h4/text()').extract()[0]
        info = each.xpath('p/text()').extract()[0]

        # Store the extracted fields on the item before collecting it
        item['name'] = name
        item['level'] = level
        item['info'] = info

        items.append(item)

    return items

def parse(self, response):
    teacher_list = response.xpath('//div[@class="main_rpicR"]')

    for each in teacher_list:
        item = ItcastItem()
        name = each.xpath('./h3/a/text()').extract()
        info = each.xpath('.//p[position()=1]/text()').extract()
        time = each.xpath('.//p[position()=2]/text()').extract()

        item["name"] = name[0]
        item["info"] = info[0]
        item["time"] = time[0]
        yield item

def parse(self, response):
    node_list = response.xpath("//div[@class='li_txt']")

    for node in node_list:
        item = ItcastItem()
        name = node.xpath("./h3/text()").extract()
        title = node.xpath("./h4/text()").extract()
        info = node.xpath("./p/text()").extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        yield item

def parse(self, response):
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    teacher_list = response.xpath('//div[@class="li_txt"]')
    teacherItem = []

    for each in teacher_list:
        item = ItcastItem()
        name = each.xpath('./h3/text()').extract()
        title = each.xpath('./h4/text()').extract()
        info = each.xpath('./p/text()').extract()

        item["name"] = name[0]
        item["title"] = title[0]
        item["info"] = info[0]
        yield item

def parse(self, response):
    # filename = "teacher.html"
    # open(filename, "wb").write(response.body)

    items = []
    for each in response.xpath("//div[@class='li_txt']"):
        item = ItcastItem()
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        items.append(item)

    return items

def parse(self, response):
    items = []
    for each in response.xpath("//div[@class='li_txt']"):
        item = ItcastItem()
        # extract() returns a list of unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # items.append(item)
        yield item

def parse(self, response):
    # Collection of teachers
    items = []
    li_all = response.xpath("//div[@class='li_txt']")

    for li_one in li_all:
        item = ItcastItem()
        name = li_one.xpath("h3/text()").extract_first()
        level = li_one.xpath("h4/text()").extract_first()
        info = li_one.xpath("p/text()").extract_first()

        item["name"] = name
        item["level"] = level
        item["info"] = info
        items.append(item)

        # Hand the data over to the pipelines
        yield item

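# extract_first(), used above, returns None instead of raising IndexError
# when nothing matches, unlike the extract()[0] pattern in the other
# variants; it also accepts a default value. A self-contained check:
from scrapy.selector import Selector

sel = Selector(text="<div class='li_txt'><h3>Tom</h3></div>")
assert sel.xpath("//h3/text()").extract_first() == "Tom"
assert sel.xpath("//h4/text()").extract_first(default="") == ""  # no <h4> present
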
def parse(self, response):
    # Collection of teacher items
    # items = []

    for each in response.xpath('//div[@class="li_txt"]'):
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() returns a unicode string; here the single-element
        # selector list returned by xpath is indexed first
        name = each.xpath('h3/text()')[0].extract()
        title = each.xpath('h4/text()')[0].extract()
        info = each.xpath('p/text()')[0].extract()

        item['name'] = name
        item['title'] = title
        item['info'] = info

        # items.append(item)
        yield item

def parse_content(self, response):
    items = []

    # Debug leftovers; leaving these active would stop the loop below
    # from ever running
    # print(response.text)
    # exit()

    # for each in response.xpath("//div[@class='li_txt']"):
    #     print(response.xpath("//div[@class='li_txt']"))
    #     exit()

    for i in response.xpath("//div[@class='li_txt']"):
        item = ItcastItem()
        name = i.xpath('h3/text()').extract()
        title = i.xpath('h4/text()').extract()
        info = i.xpath('p/text()').extract()

        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]
        yield item

def parse(self, response):
    # filename = "teachers.html"
    # with open(filename, 'w') as f:
    #     f.write(response.body)
    # print(response.body)

    items = []
    node_list = response.xpath("//div[@class='li_txt']")

    for node in node_list:
        item = ItcastItem()
        item['name'] = node.xpath("./h3/text()").extract()[0]
        item['level'] = node.xpath("./h4/text()").extract()[0]
        item['info'] = node.xpath("./p/text()").extract()[0]
        items.append(item)

    return items

def parse(self, response):
    # with open("teacher.html", "w") as f:
    #     f.write(response.body)

    # teacher_item = []
    name_list = response.xpath("//div[@class='li_txt']")

    for teacher in name_list:
        teacher_name = teacher.xpath("./h3/text()").extract()
        teacher_title = teacher.xpath("./h4/text()").extract()
        teacher_info = teacher.xpath("./p/text()").extract()
        # print(teacher_name[0] + teacher_title[0] + teacher_info[0])

        item = ItcastItem()
        item["name"] = teacher_name[0]
        item["title"] = teacher_title[0]
        item["info"] = teacher_info[0]

        yield item
        # teacher_item.append(item)

    # return teacher_item

def parse(self, response):
    items = []
    for each in response.xpath("//div[@class='li_txt']"):
        # Wrap the extracted data in an `ItcastItem` object
        item = ItcastItem()
        # extract() returns a list of unicode strings
        name = each.xpath("h3/text()").extract()
        title = each.xpath("h4/text()").extract()
        info = each.xpath("p/text()").extract()

        # xpath returns a list containing a single element
        item['name'] = name[0]
        item['title'] = title[0]
        item['info'] = info[0]

        # items.append(item)
        # Hand the scraped data over to the pipeline
        yield item
