def parse(self, response):
    """Parse a job.liepin.com job-detail page into a LiepinJobItem and
    follow every further liepin link found on the page.

    NOTE(review): a second method named ``parse`` is defined later in this
    file and shadows this one at class-definition time — confirm which
    definition is actually meant to run.
    """
    if "job.liepin.com" in response.url:
        soup = BeautifulSoup(response.body, 'lxml')
        # Only scrape when the 'title-info  over' marker div is absent
        # (presumably it flags expired postings — TODO confirm).
        if not soup.find('div', 'title-info  over'):
            item = LiepinJobItem()
            item['name'] = soup.find('div', class_='title-info').find('h1').get_text().strip()
            item['salary'] = soup.find('p', class_='job-main-title').get_text().strip().split()[0].strip()
            item['jobid'] = Util.pathnumber_job(response.url)
            item['companyid'] = Util.pathnumber_company(
                soup.find('div', class_='right-post-top').find('a')['href'])
            item['location'] = soup.find('p', class_='basic-infor').find('span').get_text().strip()

            # Requirement badges; join preserves the original
            # one-trailing-space-per-entry format.
            resume_div = soup.find('div', class_='resume clearfix')
            resume_spans = resume_div.find_all('span') if resume_div else []
            item['require'] = ''.join(s.get_text() + ' ' for s in resume_spans)

            # Tag list; join preserves the original trailing comma.
            tag_div = soup.find('div', class_='tag-list clearfix')
            tag_spans = tag_div.find_all('span') if tag_div else []
            item['tag'] = ''.join(t.get_text().strip() + ',' for t in tag_spans)

            item['jobdes'] = soup.find('div', class_='content content-word')

            # The second 'job-main main-message ' section (when present)
            # holds the optional detail rows parsed below.
            job_main_divs = soup.find_all('div', class_='job-main main-message ')
            detail_lis = job_main_divs[1].find_all('li') if len(job_main_divs) > 1 else []

            # Default every optional field, then fill from matching rows.
            for field in ('partment', 'major', 'report', 'sub', 'sex'):
                item[field] = ''
            # Distinctive character of each row label -> item field.
            label_to_field = {
                u'所': 'partment',  # 所属部门 (department)
                u'专': 'major',     # 专业 (major / specialty)
                u'汇': 'report',    # 汇报 (reports to)
                u'下': 'sub',       # 下属 (subordinates)
                u'性': 'sex',       # 性别 (gender)
            }
            for li in detail_lis:
                label = li.find('span').get_text().strip()
                # Row text minus its 5-character label prefix.
                value = li.get_text().split()[0].strip()[5:].strip()
                for marker, field in label_to_field.items():
                    if marker in label:
                        item[field] = value
            yield item

    # Crawl onward: every liepin link, skipping the mobile (m.) site.
    urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
    for url in urls:
        if "m." not in url:
            if "http" in url:
                yield Request(url, callback=self.parse)
            else:
                yield Request("http://" + url, callback=self.parse)
 def parse(self, response):
     liepinItem = LiepinCompanyItem()
     if Util.match_company(response.url):
         if "company.liepin.com" in response.url:
             normalLogo = response.selector.xpath("//img[@class='normalELogo']")
             bigLogo = response.selector.xpath("//img[@class='bigELogo']")
             bannerLogo = response.selector.xpath("//div[@class='banner']")
             soup=BeautifulSoup(response.body,'lxml')
             if bigLogo:
                 liepinItem['logo'] = soup.find('img',class_='bigELogo')['src']
                 liepinItem['name'] = soup.find('div',class_='company-name').find('h1').get_text().strip()
                 liepinItem['size'] = ""
                 liepinItem['property'] = ""
                 liepinItem['field'] = ""
                 liepinItem['finance'] = ""
                 liepinItem['website'] = ""
                 liepinItem['location'] = ""
                 liepinItem['address'] = soup.find('p',attrs={'data-selector':'company-address'}).get_text().strip()
                 tags = soup.find('section',class_="company-info").find('ul',class_='company-tags clearfix').find_all("li")
                 tag_text=','
                 for tag in tags:
                     tag_text = tag_text+tag.get_text().strip()+","
                 liepinItem['tag']=tag_text[::-1]
                 aboutuls = soup.find('ul',class_='about-list')
                 for ul in aboutuls:
                     if u"领" in ul.get_text().strip():
                         liepinItem['field'] = ul.get_text().strip()[3::]
                     if  u"地" in ul.get_text().strip():
                         liepinItem['location'] = ul.get_text().strip()[3::]
                     if u"官" in ul.get_text().strip():
                         liepinItem['website'] = ul.get_text().strip()[3::]
                     if u"融" in ul.get_text().strip():
                         liepinItem['finance'] = ul.get_text().strip()[3::]
                 liepinItem['intro'] = ""
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
                 sadd("imperfect",liepinItem['companyid'])
             elif normalLogo:
                 liepinItem['logo'] = soup.find('section',class_="introduction").find('img',class_='normalELogo')['src']
                 liepinItem['name'] = soup.find('section',class_="introduction").find('div',class_='einfo').find('h2').get_text().strip()
                 liepinItem['size'] = soup.find('section',class_="introduction").find('div',class_='e-menu').find('ul',class_='clearfix').find_all('li')[0].get_text().strip()
                 liepinItem['property'] = soup.find('section',class_="introduction").find('div',class_='e-menu').find('ul',class_='clearfix').find_all('li')[1].get_text().strip()
                 liepinItem['location'] = soup.find('section',class_="introduction").find('div',class_='e-menu').find('ul',class_='clearfix').find_all('li')[2].get_text().strip()
                 tag_list = soup.find('section',class_="introduction").find('div',class_='tag-list clearfix')
                 tag_text=''
                 if tag_list:
                     tags = tag_list.find_all('span',class_='tag')
                     for tag in tags:
                         tag_text = tag_text+tag.get_text().strip()+","
                 liepinItem['tag']=tag_text[::-1]
                 liepinItem['field']=''
                 liepinItem['finance'] = ""
                 liepinItem['website'] = ""
                 introdiv = soup.find('div',class_='intro-main')
                 liepinItem['intro'] = ""
                 liepinItem['address'] = ""
                 if introdiv:
                     liepinItem['intro'] = introdiv
                 if soup.find('p',class_='company-address'):
                     liepinItem['address'] = soup.find('p',class_='company-address').get_text().strip()
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
             elif bannerLogo:
                 liepinItem['logo'] = ""
                 liepinItem['name'] = ""
                 liepinItem['size'] = ""
                 liepinItem['property'] = ""
                 liepinItem['location'] = ""
                 liepinItem['tag'] = ""
                 liepinItem['field']=''
                 liepinItem['finance'] = ""
                 liepinItem['website'] = ""
                 liepinItem['intro'] = ""
                 liepinItem['address'] = ""
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
                 sadd("imperfect",liepinItem['companyid'])
             if int(liepinItem['companyid']) > 100000:
                 yield liepinItem
         elif "vip.liepin.com" in response.url:
             liepinItem['logo'] = ""
             liepinItem['name'] = ""
             liepinItem['size'] = ""
             liepinItem['property'] = ""
             liepinItem['location'] = ""
             liepinItem['tag'] = ""
             liepinItem['field'] = ''
             liepinItem['finance'] = ""
             liepinItem['website'] = ""
             liepinItem['intro'] = ""
             liepinItem['address'] = ""
             liepinItem['companyid'] = Util.pathnumber_vip(response.url)
             sadd("imperfect",liepinItem['companyid'])
     urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
     for url in urls:
         if "m." not in url:
             if "http" in url:
                 yield Request(url,callback = self.parse)
             else:
                 yield Request("http://"+url,callback = self.parse)