Пример #1
0
 def analyse_detail(self, res, address_dict, rest):
     """Store the company POIs found in a place-search response.

     Only the first ten entries of ``res['content']`` are examined. An
     entry is stored when its ``addr`` contains
     ``address_dict['match_word']`` and its combined ``std_tag`` +
     ``di_tag`` text contains '公司'.

     Args:
         res: Mapping expected to hold the POI list under ``'content'``.
         address_dict: Mapping read for ``'match_word'``, ``'city'`` and
             ``'region'``.
         rest: Object whose ``url`` attribute is used in log messages.

     Returns:
         None. Matching entries are written through ``Company.insert_db``.
     """
     try:
         contents = res['content']
     except (KeyError, TypeError):
         log.error('{}没有找到content这个字段'.format(rest.url))
         return
     for poi in contents[:10]:
         try:
             poi_address = poi['addr']
         except (KeyError, TypeError):
             log.error('{}没有addr这个字段'.format(rest.url))
             continue
         match_word = address_dict['match_word']
         # NOTE(review): a None match_word skips every entry instead of
         # disabling the address filter -- kept as in the original; confirm
         # this is intended.
         if match_word is None or match_word not in poi_address:
             continue
         # Either tag may be absent or None; treat both cases as empty text
         # so one incomplete entry cannot abort the whole batch.
         tag = (poi.get('std_tag') or '') + (poi.get('di_tag') or '')
         if '公司' not in tag:
             continue
         try:
             company = Company(poi['primary_uid'], self.source)
         except Exception:
             # Company() is opaque here, so any failure while building the
             # record is logged and only this entry is skipped.
             log.error('{}中无法匹配到company_id'.format(rest.url))
             continue
         company_name = poi['name']
         city = address_dict['city']
         region = address_dict['region']
         print(company_name, poi_address, city, region)
         company.company_name = company_name
         company.address = poi_address
         company.city = city
         company.region = region
         company.insert_db()
Пример #2
0
    def analyze_detail(self, html, company_id, url):
        """Parse a company detail page and store the company record.

        The page is expected to embed a JSON document in the
        ``<script id="companyInfoData">`` element. The address, city and
        company name are required; the remaining fields are copied only
        when present and non-empty.

        Args:
            html: Page markup passed to ``etree.HTML``.
            company_id: Identifier passed to ``Company``.
            url: Page URL; stored on the record and used in log messages.

        Returns:
            None. Nothing is stored when the embedded JSON is missing,
            cannot be decoded, or lacks a required field.
        """
        xpath_html = etree.HTML(html)
        company = Company(company_id=company_id, company_source=self.source)

        try:
            company_text = xpath_html.xpath(
                "//script[@id='companyInfoData']/text()")[0]
        except (IndexError, AttributeError):
            # No embedded data block (or no parsed tree at all): there is
            # nothing to extract from this page.
            return

        try:
            company_info = json.loads(company_text)
        except ValueError as e:
            log.error('{}缺少必要字段,error={}'.format(url, e))
            return

        # Required fields: a failure here means the record is not stored.
        try:
            address = company_info['addressList'][0]
            core_info = company_info['coreInfo']
            company.address = address['detailAddress']
            company.city = address['city']
            company.company_name = core_info['companyName']
        except (KeyError, IndexError, TypeError) as e:
            log.error('{}缺少必要字段,error={}'.format(url, e))
            return

        # Optional sections: a missing section is treated as empty so it
        # cannot abort the record after the required fields were found.
        base_info = company_info.get('baseInfo') or {}
        introduction = company_info.get('introduction') or {}

        # Long description
        if introduction.get('companyProfile'):
            company.company_info = introduction['companyProfile']
        # Short description
        if core_info.get('companyIntroduce'):
            company.company_short_info = core_info['companyIntroduce']
        if base_info.get('industryField'):
            company.business = base_info['industryField']
        if base_info.get('financeStage'):
            company.development_stage = base_info['financeStage']
        if base_info.get('companySize'):
            company.company_size = base_info['companySize']
        # District
        if address.get('district'):
            company.region = address['district']
        company.url = url
        company.insert_db()
 async def detail_parse(self, res):
     """Extract company details from a page's markup and store them.

     The company id, name, address and city are required: if any of them
     cannot be found the failure is logged and nothing is stored. The
     remaining fields are optional and are set only when the page
     contains them.

     Args:
         res: Page markup as text; parsed with ``etree.HTML`` and also
             searched directly with regular expressions.

     Returns:
         None. The record is written through ``Company.insert_db``.
     """
     html = etree.HTML(res)
     try:
         company_id = re.search(r'ecomp_id : "(\d+)"', res).group(1)
         company = Company(company_id, self.source)
         company.company_name = html.xpath('//h1/text()')[0]
         company.address = re.search('公司地址:</span>(.*?)</li>', res).group(1)
         company.city = re.search('data-city="(.*?)"', res).group(1)

         # xpath() returns a list, never None, so test for emptiness;
         # otherwise a missing optional node raises IndexError and the
         # whole record is dropped.
         profile = html.xpath("//p[@class='profile']/text()")
         if profile:
             company.company_info = profile[0].strip()

         optional_patterns = (
             ('company_size', '公司规模:</span>(.*?)</li>'),
             ('operating_period', '经营期限:(.*?)</li'),
             ('registration_time', '注册时间:(.*?)</li'),
             ('registered_capital', '注册资本:(.*?)</li'),
         )
         for attr, pattern in optional_patterns:
             match = re.search(pattern, res)
             if match is not None:
                 setattr(company, attr, match.group(1))

         industry = html.xpath("//a[@class='comp-industry']/text()")
         if industry:
             company.business = industry[0]
         company.insert_db()
     except Exception as e:
         # Best-effort boundary: any failure, including a missing required
         # field, is logged instead of propagating to the caller.
         log.error('解析失败{}'.format(e))
Пример #4
0
    def send_url(self, url):
        """Download a company page, extract its fields and store the record.

        The company id is taken from the ``/company/<digits>/`` part of
        ``url``; if it cannot be found the failure is logged and nothing
        is stored. Every other field is optional and is set only when the
        page contains it.

        Args:
            url: Address of the company page.

        Returns:
            None. The record is written through ``Company.insert_db``.
        """
        # NOTE(review): no timeout is passed, so a stalled connection can
        # block this call indefinitely -- consider adding one.
        try:
            rest = requests.get(url, headers=self.headers, proxies=self.proxies)
        except Exception as e:
            log.error('{}失败,原因是{}'.format(url, e))
            return
        res = rest.text
        html = etree.HTML(res)
        try:
            company_id = re.search(r'/company/(\d+)/', url).group(1)
            print(company_id)
            company = Company(company_id, self.source)
            company.url = url
        except Exception as e:
            log.error('缺少必要字段,原因{}'.format(e))
            return

        company_name = html.xpath('//h1/text()')
        if company_name:
            company.company_name = company_name[0]

        address = None
        address_match = re.search('data-address="(.*?)"', res)
        if address_match:
            address = address_match.group(1)
            print(address)
            company.address = address

        city_list = html.xpath('//div[@class="comp-summary-tag"]/a[@class="comp-summary-tag-dq"]/text()')
        if city_list:
            company.city = city_list[0]
        # The region is filled from the same (non-empty) address text.
        if address:
            company.region = address

        business_list = html.xpath('//div[@class="comp-summary-tag"]/a[@data-selector="comp-industry"]/text()')
        if business_list:
            company.business = business_list[0]

        # The first summary tag is the company size when it contains '人';
        # otherwise it is taken as the development stage and the second
        # tag is checked for the size.
        first_tag = html.xpath('//div[@class="comp-summary-tag"]/a[1]/text()')
        if first_tag:
            if '人' in first_tag[0]:
                company.company_size = first_tag[0]
            else:
                company.development_stage = first_tag[0]
                second_tag = html.xpath('//div[@class="comp-summary-tag"]/a[2]/text()')
                if second_tag and '人' in second_tag[0]:
                    company.company_size = second_tag[0]

        text_list = html.xpath("//p[@class='profile']/text()")
        if text_list:
            # Join the text nodes with commas, then drop all whitespace.
            company.company_info = ''.join(','.join(text_list).split())

        optional_patterns = (
            ('operating_period', '经营期限:(.*?)</li'),
            ('registration_time', '注册时间:(.*?)</li'),
            ('registered_capital', '注册资本:(.*?)</li'),
        )
        for attr, pattern in optional_patterns:
            match = re.search(pattern, res)
            if match is not None:
                setattr(company, attr, match.group(1))
        company.insert_db()
Пример #5
0
    def analyze_detail(self, html, company_id, url):
        """Fill a ``Company`` record from a detail page and store it.

        The address and long description are read as XPath string values
        (empty text when the node is missing). All other fields take the
        first matching text node, so a missing node raises ``IndexError``.

        Args:
            html: Page markup passed to ``etree.HTML``.
            company_id: Identifier passed to ``Company``.
            url: Page URL stored on the record.
        """
        tree = etree.HTML(html)
        record = Company(company_id=company_id, company_source=self.source)

        def first_text(path):
            # First text node matched by ``path``, surrounding blanks removed.
            return tree.xpath(path)[0].strip()

        record.address = tree.xpath(
            'string(//*[@id="location_container"]/div[2]/div[2])').strip()
        record.company_info = tree.xpath(
            'string(//*[@id="company_intro"])').strip()

        record.company_short_info = first_text(
            '/html/body/div[2]/div/div/div[1]/div/text()')
        record.city = first_text(
            '//*[@id="basic_container"]/div[2]/ul/li[4]/span/text()')
        record.business = first_text(
            '//*[@id="basic_container"]/div[2]/ul/li[1]/span/text()')
        record.development_stage = first_text(
            '//*[@id="basic_container"]/div[2]/ul/li[2]/span/text()')
        record.company_name = first_text(
            '/html/body/div[2]/div/div/div[1]/h1/a/text()')
        record.company_size = first_text(
            '//*[@id="basic_container"]/div[2]/ul/li[3]/span/text()')
        record.url = url
        record.insert_db()
    def data_fetch(self, url):
        """Download a 51job company page and store the extracted record.

        The company id comes from the ``co<digits>.html`` part of ``url``
        and the company name from the page heading; when either is
        missing a message is printed and nothing is stored. Address,
        description, size and business are extracted together: if any of
        them cannot be read, the error is printed and all four are left
        unset.

        Args:
            url: Address of the company page.

        Returns:
            None. The record is written through ``Company.insert_db``.
        """
        # NOTE(review): no timeout is passed, so a stalled connection can
        # block this call indefinitely -- consider adding one.
        response = requests.get(url=url, proxies=self.proxies, headers=self.headers)
        print(url)
        if response.status_code != 200:
            print(response.status_code)
            return

        response.encoding = 'GBK'
        tree = etree.HTML(response.text)
        id_match = re.search(r'https://jobs\.51job\.com/all/co(\d+)\.html', url)
        if id_match is None:
            # Without an id there is no record to build.
            print('company id not found in url: {}'.format(url))
            return
        company_id = id_match.group(1)
        company_source = '51job'
        company = Company(company_id=company_id, company_source=company_source)
        try:
            raw_address = tree.xpath("/html/body/div[2]/div[2]/div[3]/div[2]/div/p/text()")[1]
            compact_address = raw_address.replace(' ', '').replace('\n', '')
            # Keep only the part before the first '(' when one is present.
            address = compact_address.split('(')[0]
            company_info = tree.xpath("//div[@class='con_txt']/text()")[0]
            type_parts = tree.xpath("//p[@class='ltype']/text()")[0].split('|')
            # With three or more '|'-separated parts the first one is
            # skipped; otherwise the parts are taken as size | business.
            if len(type_parts) > 2:
                company_size = type_parts[1]
                business = type_parts[2]
            else:
                company_size = type_parts[0]
                business = type_parts[1]
            company.address = address
            company.company_info = company_info
            company.business = business
            company.company_size = company_size
        except (IndexError, AttributeError) as e:
            print(e)

        name_nodes = tree.xpath("//div[@class='tHeader tHCop']/div[1]/h1/text()")
        if not name_nodes:
            print('company name not found: {}'.format(url))
            return
        company.company_id = company_id
        company.company_name = name_nodes[0]
        company.company_source = company_source
        company.url = url
        company.insert_db()