Python Util.match_company示例

编程语言: Python
命名空间/包名称: liepinspiders.util.reutil
类/类型: Util
方法/功能: match_company
hotexamples.com的示例: 1
Python Util.match_company - 共找到 1 个示例。以下是从开源项目中提取的、评价最高的 liepinspiders.util.reutil.Util.match_company 真实 Python 示例。您可以为示例评分，以帮助我们提高示例质量。
常用方法
显示/隐藏
pathnumber_company(2)
match_company(1)
pathnumber_job(1)
示例#1
显示文件
文件： liepinspider.py 项目： zhaoyu20150930/liepincrawl
 def parse(self, response):
     """Build a company item from a Liepin page, then follow Liepin links.

     When ``Util.match_company`` accepts ``response.url``:

     * ``company.liepin.com`` pages are parsed according to which marker
       element is present, checked in this order: ``img.bigELogo``,
       ``img.normalELogo``, ``div.banner``.  The item is yielded only if
       its ``companyid`` converts to an int greater than 100000.
     * ``vip.liepin.com`` pages only get a blank item whose id is passed to
       ``sadd("imperfect", ...)``; that item is not yielded.

     Independently of the above, every ``<a href>`` containing ``liepin``
     and not containing ``m.`` is scheduled with this method as callback.

     :param response: the downloaded page (uses ``url``, ``body`` and
         ``selector``).
     :return: generator of ``LiepinCompanyItem`` and ``Request`` objects.
     """
     liepinItem = LiepinCompanyItem()
     # Fields blanked for pages that carry no usable data, listed in the
     # order the item keys have always been assigned.
     blank_fields = ('logo', 'name', 'size', 'property', 'location', 'tag',
                     'field', 'finance', 'website', 'intro', 'address')
     if Util.match_company(response.url):
         if "company.liepin.com" in response.url:
             normalLogo = response.selector.xpath("//img[@class='normalELogo']")
             bigLogo = response.selector.xpath("//img[@class='bigELogo']")
             bannerLogo = response.selector.xpath("//div[@class='banner']")
             soup = BeautifulSoup(response.body, 'lxml')
             # Cleared when none of the three known layouts matched, in
             # which case the item has no 'companyid' to inspect.
             layout_matched = True
             if bigLogo:
                 liepinItem['logo'] = soup.find('img', class_='bigELogo')['src']
                 liepinItem['name'] = soup.find('div', class_='company-name').find('h1').get_text().strip()
                 for key in ('size', 'property', 'field', 'finance', 'website', 'location'):
                     liepinItem[key] = ""
                 liepinItem['address'] = soup.find('p', attrs={'data-selector': 'company-address'}).get_text().strip()
                 tags = soup.find('section', class_="company-info").find('ul', class_='company-tags clearfix').find_all("li")
                 tag_text = ',' + ''.join(tag.get_text().strip() + "," for tag in tags)
                 # NOTE(review): [::-1] reverses the whole string; [:-1]
                 # (drop trailing comma) was probably intended.  Kept as is
                 # because the stored format may be relied on -- confirm.
                 liepinItem['tag'] = tag_text[::-1]
                 aboutuls = soup.find('ul', class_='about-list')
                 for entry in aboutuls:
                     entry_text = entry.get_text().strip()
                     # The value is whatever follows the first three
                     # characters of the entry text.
                     if u"领" in entry_text:
                         liepinItem['field'] = entry_text[3:]
                     if u"地" in entry_text:
                         liepinItem['location'] = entry_text[3:]
                     if u"官" in entry_text:
                         liepinItem['website'] = entry_text[3:]
                     if u"融" in entry_text:
                         liepinItem['finance'] = entry_text[3:]
                 liepinItem['intro'] = ""
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
                 # presumably records the id in a set of incomplete
                 # companies; sadd is defined outside this block -- confirm
                 sadd("imperfect", liepinItem['companyid'])
             elif normalLogo:
                 intro_section = soup.find('section', class_="introduction")
                 liepinItem['logo'] = intro_section.find('img', class_='normalELogo')['src']
                 liepinItem['name'] = intro_section.find('div', class_='einfo').find('h2').get_text().strip()
                 menu_items = intro_section.find('div', class_='e-menu').find('ul', class_='clearfix').find_all('li')
                 liepinItem['size'] = menu_items[0].get_text().strip()
                 liepinItem['property'] = menu_items[1].get_text().strip()
                 liepinItem['location'] = menu_items[2].get_text().strip()
                 tag_list = intro_section.find('div', class_='tag-list clearfix')
                 tag_text = ''
                 if tag_list:
                     tag_text = ''.join(
                         tag.get_text().strip() + ","
                         for tag in tag_list.find_all('span', class_='tag'))
                 # NOTE(review): same string reversal as in the bigELogo
                 # branch; probably meant [:-1] -- confirm before changing.
                 liepinItem['tag'] = tag_text[::-1]
                 liepinItem['field'] = ''
                 liepinItem['finance'] = ""
                 liepinItem['website'] = ""
                 introdiv = soup.find('div', class_='intro-main')
                 liepinItem['intro'] = ""
                 liepinItem['address'] = ""
                 if introdiv:
                     # Stores the parsed element itself, not its text.
                     liepinItem['intro'] = introdiv
                 address_node = soup.find('p', class_='company-address')
                 if address_node:
                     liepinItem['address'] = address_node.get_text().strip()
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
             elif bannerLogo:
                 for key in blank_fields:
                     liepinItem[key] = ""
                 liepinItem['companyid'] = Util.pathnumber_company(response.url)
                 sadd("imperfect", liepinItem['companyid'])
             else:
                 layout_matched = False
             # Reading 'companyid' on an unrecognised layout used to fail
             # on the unset key and abort the callback before any link was
             # followed; such pages are now simply not yielded.
             if layout_matched and int(liepinItem['companyid']) > 100000:
                 yield liepinItem
         elif "vip.liepin.com" in response.url:
             for key in blank_fields:
                 liepinItem[key] = ""
             liepinItem['companyid'] = Util.pathnumber_vip(response.url)
             sadd("imperfect", liepinItem['companyid'])
     urls = response.selector.xpath("//a/@href[contains(.,'liepin')]").extract()
     for url in urls:
         # Skip any link whose text contains "m." (substring test).
         if "m." in url:
             continue
         if "http" in url:
             yield Request(url, callback=self.parse)
         else:
             # Links without "http" get a bare scheme prefix prepended.
             yield Request("http://" + url, callback=self.parse)