async def detail_parse(self, res):
    """Extract company fields from a detail page and insert the record.

    The company id, name, address and city are required: if any of them
    cannot be found, the failure is logged and nothing is inserted.  The
    remaining fields are optional and are simply skipped when absent.

    Note: the coroutine contains no ``await``; it is declared ``async``
    only so callers can await it.

    Args:
        res: Page markup as text; it is parsed with ``etree.HTML`` and
            searched with regular expressions.

    Returns:
        None.  Exceptions raised while extracting or inserting are caught
        and reported through ``log.error``.
    """
    html = etree.HTML(res)
    try:
        # Required fields: a missing one raises and aborts the record.
        company_id = re.search(r'ecomp_id : "(\d+)"', res).group(1)
        company = Company(company_id, self.source)
        company.company_name = html.xpath('//h1/text()')[0]
        company.address = re.search('公司地址:</span>(.*?)</li>', res).group(1)
        company.city = re.search('data-city="(.*?)"', res).group(1)

        # xpath() returns a list for node queries, so emptiness (not
        # ``is not None``) is the correct "is it present?" test.
        profile = html.xpath("//p[@class='profile']/text()")
        if profile:
            company.company_info = profile[0].strip()

        # Optional regex-extracted fields: (attribute, pattern).
        optional_patterns = (
            ('company_size', '公司规模:</span>(.*?)</li>'),
            ('operating_period', '经营期限:(.*?)</li'),
            ('registration_time', '注册时间:(.*?)</li'),
            ('registered_capital', '注册资本:(.*?)</li'),
        )
        for attr, pattern in optional_patterns:
            found = re.search(pattern, res)
            if found is not None:
                setattr(company, attr, found.group(1))

        industry = html.xpath("//a[@class='comp-industry']/text()")
        if industry:
            company.business = industry[0]

        company.insert_db()
    except Exception as e:
        log.error('解析失败{}'.format(e))
Exemplo n.º 2
0
    def send_url(self, url, timeout=30):
        """Download a company page, extract its fields and insert the record.

        The company id is taken from the ``/company/<digits>/`` segment of
        ``url``; if it is missing, or the request fails, the problem is
        logged and the method returns without inserting anything.  All
        other fields are optional and are set only when found in the page.

        Args:
            url: Address of the company page.
            timeout: Seconds handed to ``requests.get``.  Without a
                timeout the request could block indefinitely on a stalled
                server or proxy.

        Returns:
            None.
        """
        try:
            rest = requests.get(url, headers=self.headers,
                                proxies=self.proxies, timeout=timeout)
        except Exception as e:
            log.error('{}失败,原因是{}'.format(url, e))
            return
        res = rest.text
        html = etree.HTML(res)
        try:
            company_id = re.search(r'/company/(\d+)/', url).group(1)
            company = Company(company_id, self.source)
            company.url = url
        except Exception as e:
            log.error('缺少必要字段,原因{}'.format(e))
            return

        company_name = html.xpath('//h1/text()')
        if company_name:
            company.company_name = company_name[0]

        address_match = re.search('data-address="(.*?)"', res)
        address = address_match.group(1) if address_match else None
        if address_match:
            company.address = address

        summary_tag = '//div[@class="comp-summary-tag"]'
        city_list = html.xpath(
            summary_tag + '/a[@class="comp-summary-tag-dq"]/text()')
        if city_list:
            company.city = city_list[0]
        # The region mirrors the address, but only when it is non-empty.
        if address:
            company.region = address

        business_list = html.xpath(
            summary_tag + '/a[@data-selector="comp-industry"]/text()')
        if business_list:
            company.business = business_list[0]

        # The first tag is either the head-count (contains '人') or the
        # development stage; in the latter case the head-count, if any,
        # is looked for in the second tag.
        first_tag = html.xpath(summary_tag + '/a[1]/text()')
        if first_tag:
            if '人' in first_tag[0]:
                company.company_size = first_tag[0]
            else:
                company.development_stage = first_tag[0]
                second_tag = html.xpath(summary_tag + '/a[2]/text()')
                if second_tag and '人' in second_tag[0]:
                    company.company_size = second_tag[0]

        text_list = html.xpath("//p[@class='profile']/text()")
        if text_list:
            # Join the fragments, then strip every whitespace character.
            company.company_info = ''.join(','.join(text_list).split())

        # Optional regex-extracted fields: (attribute, pattern).
        optional_patterns = (
            ('operating_period', '经营期限:(.*?)</li'),
            ('registration_time', '注册时间:(.*?)</li'),
            ('registered_capital', '注册资本:(.*?)</li'),
        )
        for attr, pattern in optional_patterns:
            found = re.search(pattern, res)
            if found is not None:
                setattr(company, attr, found.group(1))

        company.insert_db()