def parse1(self, response):
    """Parse a Lagou job-detail page into a LagouItem, then follow the
    next-page link.

    Yields:
        LagouItem: the scraped job posting.
        Request: a follow-up request for the next listing page.
    """
    x = HtmlXPathSelector(response)
    item = LagouItem()
    item['title'] = x.xpath("//dt[@class='clearfix']/h1/text()").extract()[2]
    item['companyname'] = x.xpath("//div[@class='s_txt_jobs']//a[@style='font-size:14px;font-weight:bold;color:#000000;']/text()").extract()[0]
    item['tag'] = x.xpath("//div[@class='s_txt_jobs']//div[@class='jobdetail_divRight_span']").extract()
    # Label cells (txt_1) and value cells (txt_2 ) appear in parallel order;
    # pair them up and dispatch on the label text.
    title = x.xpath("//div[@class='s_txt_jobs']//table[@class='jobs_1']//td[@class='txt_1']/text()").extract()
    nei = x.xpath("//div[@class='s_txt_jobs']//table[@class='jobs_1']//td[@class='txt_2 ']/text()").extract()
    for label, value in zip(title, nei):
        if label == u'发布日期:':
            item['day'] = value
        elif label == u'工作地点:':
            item['place'] = value
        elif label == u'招聘人数:':
            item['cnt'] = value
        elif label == u'工作年限:':
            item['jobyear'] = value
        elif label.replace(u'\xa0', u'') == u'学历:':
            # This label carries a non-breaking space; strip it before comparing.
            item['xueli'] = value
        elif label == u'薪水范围:':
            item['money'] = value
    # BUG FIX: the original took extract()[0] (a single string), so
    # '\t'.join(...) inserted a tab between every CHARACTER. Join the full
    # list of detail fragments instead.
    detail = x.xpath("//td[@class='txt_4 wordBreakNormal job_detail ']/div").extract()
    item['jobdetail'] = '\t'.join(detail)
    fu = x.xpath("//div[@class='s_txt_jobs']//div[@class='jobdetail_divRight_span']/span[@class='Welfare_label']/text()").extract()
    if len(fu) > 0:
        item['fuli'] = fu
    item['url'] = response.url
    yield item
    # BUG FIX: the original built the Request but never yielded it, so the
    # spider never followed the next-page link. (Also removed the debug
    # print that called the non-existent x.path(...) — AttributeError.)
    next_url = x.xpath("//a[@style='border:0px; width:auto;margin-left:5px;']/@href").extract()[2]
    yield Request(next_url, callback=self.parse)
def parse(self, response):
    """Extract title/link/description items from a directory listing page.

    Returns:
        list: one DmozItem per <li> under //fieldset/ul.
    """
    hxs = HtmlXPathSelector(response)
    # BUG FIX: selectors have no .path() method (AttributeError at runtime);
    # use .xpath() as the other spider callbacks in this file do.
    sites = hxs.xpath('//fieldset/ul/li')
    items = []
    for site in sites:
        item = DmozItem()
        item['title'] = site.xpath('a/text()').extract()
        item['link'] = site.xpath('a/@href').extract()
        item['desc'] = site.xpath('text()').extract()
        items.append(item)
    return items
def parse(self, response):
    """Collect title/href pairs from the anchors inside the target post cell.

    Returns:
        list: one x3cn_Item per <a> under td#postmessage_6511893.
    """
    items = []
    hxs = HtmlXPathSelector(response)
    # BUG FIX: selectors have no .path() method (AttributeError at runtime);
    # use .xpath() instead.
    sites = hxs.xpath("//td[@id='postmessage_6511893']/a")
    for site in sites:
        item = x3cn_Item()
        item['title'] = site.xpath("text()").extract()
        item['link'] = site.xpath("@href").extract()
        items.append(item)
    return items
def parse(self, response):
    """Collect title/href pairs from the anchors inside the target post cell.

    NOTE(review): this method duplicates the previous parse() verbatim —
    consider removing one copy.

    Returns:
        list: one x3cn_Item per <a> under td#postmessage_6511893.
    """
    items = []
    hxs = HtmlXPathSelector(response)
    # BUG FIX: selectors have no .path() method (AttributeError at runtime);
    # use .xpath() instead.
    sites = hxs.xpath("//td[@id='postmessage_6511893']/a")
    for site in sites:
        item = x3cn_Item()
        item['title'] = site.xpath("text()").extract()
        item['link'] = site.xpath("@href").extract()
        items.append(item)
    return items