Example #1
    def get_pagination(self, key):
        min_page = 0
        max_page = 5
        if not key:
            return min_page, max_page

        if API_MODE == 'tyc':
            return min_page, max_page
        elif API_MODE == 'pro':
            url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, '0', parse.quote(key))
            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')

            soup = BeautifulSoup(search_resp, 'lxml')
            search_pagination = soup.find_all('div',
                                              class_='search-pagination')

            def while_req(url):
                sub_is_ok, sub_search_resp = api_get(url=url,
                                                     headers=self.headers,
                                                     data={},
                                                     resptype='text')
                return sub_is_ok, sub_search_resp

            # Manual verification: wait for a human to pass the captcha, then retry
            if len(search_pagination) == 0 or not is_ok:
                while 1:
                    if is_ok and len(search_pagination) > 0:
                        break
                    else:
                        LOG.critical('VERIFY ############### %s ###############' %
                                     url)
                        random_sleep(20, 25)
                        is_ok, search_resp = while_req(url)
                        soup = BeautifulSoup(search_resp, 'lxml')
                        search_pagination = soup.find_all(
                            'div', class_='search-pagination')

            links = search_pagination[0].find_all('a')
            for index_a, a in enumerate(links):
                if index_a == (len(links) - 2):
                    max_page = a.string.strip()
                    if '...' in max_page:
                        max_page = max_page.split('...')[1]
                    # convert in both branches, not only after splitting on '...'
                    max_page = int(max_page)
                    break
            LOG.info('[%s] pagination max: %s' % (key, max_page))
            return min_page, max_page
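
The `while 1` verification blocks in these examples retry forever. A bounded variant is usually safer; below is a minimal sketch of a reusable helper, assuming the same `api_get`, `random_sleep`, and `LOG` helpers the examples already rely on (`wait_for_verification` and `max_attempts` are hypothetical names introduced here, not part of the original code).

from bs4 import BeautifulSoup


def wait_for_verification(url, headers, selector, max_attempts=10):
    """Re-fetch `url` until `selector` matches, at most `max_attempts` times.

    `selector` is a (name, attrs) pair for BeautifulSoup.find_all.
    Returns (is_ok, soup, matches); `matches` is empty if we gave up.
    """
    name, attrs = selector
    soup, matches = None, []
    for attempt in range(max_attempts):
        is_ok, resp = api_get(url=url, headers=headers, data={}, resptype='text')
        soup = BeautifulSoup(resp, 'lxml')
        matches = soup.find_all(name, attrs=attrs)
        if is_ok and matches:
            return is_ok, soup, matches
        LOG.critical('VERIFY ############### %s (attempt %s) ###############'
                     % (url, attempt + 1))
        random_sleep(20, 25)  # give a human time to pass the captcha
    return False, soup, []

With this helper, each verification loop collapses to a single call, e.g. `is_ok, soup, search_pagination = wait_for_verification(url, self.headers, ('div', {'class': 'search-pagination'}))`.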
Example #2
    def work_by_key(self, key):
        print(key, '@' * 100)
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        # page
        is_page = False
        for _ in range(9):  # probe page 1, retrying up to 9 times
            url = '%s/p%s?key=%s' % (TYC_SEARCH_API, 1, parse.quote(key))
            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')
            self.headers['Cookie'] = cookies_get()
            if is_ok:
                is_page = True
                break
        page_vals = 200  # fall back to 200 pages when the count cannot be read
        if is_page:
            et_late = etree.HTML(search_resp)
            page_num = [
                i.xpath('./li/a/text()')[-2] for i in et_late.xpath(
                    '//div[@class="result-footer"]/div[@class=" search-pager"]/ul'
                )
            ]
            if page_num:
                page_vals = str(page_num[0]).replace('.', '')

        LOG.critical(f'search key: {key}, total pages: {page_vals}------------------------')
        print(f'search key: {key}, total pages: {page_vals}------------------------')
        # company list
        for page in range(1, int(page_vals)):
            self.headers['Cookie'] = cookies_get()
            url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key))
            print(url, 'Q' * 80)
            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')
            if not is_ok:
                continue
            soup = BeautifulSoup(search_resp, 'lxml')
            tags = soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

            def while_req(url):
                self.headers['Cookie'] = cookies_get()
                sub_is_ok, sub_search_resp = api_get(url=url,
                                                     headers=self.headers,
                                                     data={},
                                                     resptype='text')
                return sub_is_ok, sub_search_resp

            html_tree = etree.HTML(search_resp)
            print(
                html_tree.xpath(
                    '//*[@id="web-content"]/div/div[1]/div[3]/div[2]/div[1]/div/div[3]/div[1]/a/text()'
                ), 'A' * 80)

            # Manual verification: wait for a human to pass the captcha, then retry
            if len(tags) == 0:
                while 1:
                    if is_ok and len(tags) > 0:
                        break
                    else:
                        print(url)
                        LOG.critical('VERIFY ############### %s ###############' %
                                     url)
                        random_sleep(20, 25)
                        self.headers['Cookie'] = cookies_get()
                        is_ok, search_resp = while_req(url)
                        soup = BeautifulSoup(search_resp, 'lxml')
                        tags = soup.find_all(
                            'a',
                            attrs={"tyc-event-ch": "CompanySearch.Company"})
            eto = etree.HTML(search_resp)
            user_name = eto.xpath('//div[@nav-type="user"]/a/text()')

            # Retry until the page shows a logged-in user (up to 9 attempts);
            # an anonymous page yields no usable company list.
            is_success = bool(''.join(user_name))
            for _ in range(9):
                if is_success:
                    break
                self.headers['Cookie'] = cookies_get()
                is_ok, search_resp = while_req(url)
                soup = BeautifulSoup(search_resp, 'lxml')
                tags = soup.find_all(
                    'a', attrs={"tyc-event-ch": "CompanySearch.Company"})
                user_name = etree.HTML(search_resp).xpath(
                    '//div[@nav-type="user"]/a/text()')
                is_success = bool(''.join(user_name))
            if is_success:
                for tag in tags:
                    if not tag or not tag.attrs.get('href'):
                        continue

                    res_dict = dict()
                    res_dict['tyt_url'] = tag.get('href').strip()
                    res_dict['name'] = tag.get_text().strip()
                    res_dict['company_id'] = str(
                        tag.get('href')).split('/')[-1]
                    res_dict['label_index'] = str(key)
                    res_dict['request_url'] = url
                    res_dict['source'] = '天眼查'
                    res_dict['created_time'] = str(datetime.now())
                    result = _insert(res_dict)
                    if result.get('status', False):
                        c_id = str(result.get('_id'))
                        try:
                            self.detail_by_url(res_dict.get('tyt_url'), c_id)
                        except Exception:
                            # retry once before giving up on this company
                            try:
                                self.detail_by_url(res_dict.get('tyt_url'),
                                                   c_id)
                            except Exception:
                                pass

                    ret_res.append(res_dict)
                    random_sleep(1, 2.5)
        return ret_res
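
Examples #1 and #2 both derive the page count from the second-to-last pagination link, whose text can look like '347', '...347', or '5.'. A minimal defensive parse, as a sketch (`parse_max_page` is a hypothetical helper, not part of the original code):

def parse_max_page(link_texts, default=200):
    """Extract the max page number from pagination link texts."""
    if len(link_texts) < 2:
        return default
    raw = link_texts[-2].strip()
    if '...' in raw:             # e.g. '...347' -> '347'
        raw = raw.split('...')[-1]
    raw = raw.replace('.', '')   # e.g. '5.' -> '5'
    try:
        return int(raw)
    except ValueError:
        return default

Usage mirrors the loops above: `max_page = parse_max_page([a.get_text() for a in search_pagination[0].find_all('a')])`.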
Example #3
    def detail_by_url(self, comp_url: str, obj_id: str):
        print(self.count, comp_url, obj_id, '$' * 80)
        detail_res = dict()
        if not comp_url:
            return detail_res

        is_ok, search_resp = api_get(url=comp_url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')
        if not is_ok:
            return detail_res

        soup = BeautifulSoup(search_resp, 'lxml')

        # header: company name on the detail page
        title_list = soup.find_all('div', class_="header")
        et2 = etree.HTML(search_resp)
        try:
            company_name = (title_list[0].find_all(
                'h1', class_="name"))[0].get_text()
        except (IndexError, AttributeError):
            # fall back to an absolute XPath when the header block is missing
            name = et2.xpath(
                '//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()')
            company_name = ''.join(name)
        detail_res['company_name'] = company_name

        # Phone numbers: the "more contacts" block is embedded in a script tag
        origin_phone = et2.xpath(
            '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()'
        )

        # Email addresses: the "more emails" block is embedded in a script tag
        origin_email = et2.xpath(
            '//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()'
        )

        if origin_phone and origin_email:
            # ast.literal_eval is a safer drop-in for eval() on these list/dict
            # literals (assumes `import ast` at module level)
            phone_records = ast.literal_eval(origin_phone[0])
            year_list = [i.get('showSource') for i in phone_records]
            phone_item_vals = [i.get('phoneNumber') for i in phone_records]
            email_list = ast.literal_eval(origin_email[0])
            for contact in zip(year_list, phone_item_vals, email_list):
                contact_item = {
                    'c_id': obj_id,
                    'company_name': detail_res.get('company_name', ''),
                    'report_year': contact[0],
                    'phone': contact[1],
                    'email': contact[-1],
                    'date_time': datetime.now(),
                }
                # note: the filter keys on c_id only, so each upsert overwrites
                # the previous contact for this company
                bixao_phone_emial.find_one_and_update({'c_id': obj_id},
                                                      {'$set': contact_item},
                                                      upsert=True)

        # detail: phone, email, company website, address, profile
        detail_div = soup.find_all('div', class_="detail")

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # Manual verification: wait for a human to pass the captcha, then retry
        if not detail_div:
            while 1:
                if is_ok and detail_div:
                    break
                else:
                    LOG.critical('VERIFY ############### %s ###############' %
                                 comp_url)
                    random_sleep(20, 25)
                    self.headers['Cookie'] = cookies_get()
                    is_ok, search_resp = while_req(comp_url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    detail_div = soup.find_all('div', class_="detail")

        for div in detail_div[0].find_all('div'):
            if not div:
                continue

            # f0: phone && email
            if div.get('class') == ['f0']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['phone'] = child.get_text().strip(
                                ) or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['email'] = child.get_text().strip(
                                ) or '-'
                                break
                    else:
                        break
            # company website && address
            elif div.get('class') == ['f0', 'clearfix']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['company_url'] = child.get_text(
                                ).strip() or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                for small_index, small_child in enumerate(
                                        child.children):
                                    if small_index == 0:
                                        detail_res[
                                            'address'] = small_child.get_text(
                                            ).strip() or '-'
                                        break
                                break
                    else:
                        break
            # profile
            elif div.get('class') == ['summary']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        resume = big_child.string
                        if resume:
                            resume = resume.strip()
                        detail_res['resume'] = resume or '-'
                        break
                    else:
                        break
            else:
                continue

        # detail-list:
        detail_list_div = soup.find_all('div', class_="detail-list")
        if not detail_list_div:
            return detail_res

        detail_res['c_id'] = obj_id
        etc = etree.HTML(search_resp)
        for div in detail_list_div[0].find_all('div'):
            if not div:
                continue

            if div.get('tyc-event-ch'
                       ) == 'CompangyDetail.gongshangxinxin':  # business registration info
                # (row index, cell index) -> detail_res field
                field_map = {
                    (0, 1): 'register_funds',          # registered capital
                    (0, 3): 'paidin_funds',            # paid-in capital
                    (1, 1): 'establish_date',          # date of establishment
                    (1, 3): 'status',                  # operating status
                    (2, 1): 'credit_code',             # unified social credit code
                    (2, 3): 'registration_number',     # business registration number
                    (3, 1): 'identification_number',   # taxpayer identification number
                    (3, 3): 'organization_code',       # organization code
                    (4, 1): 'company_type',            # company type
                    (4, 3): 'industry',                # industry
                    (6, 1): 'business_term',           # business term
                    (6, 3): 'taxpayer_qualification',  # taxpayer qualification
                    (7, 1): 'personnel_size',          # staff size
                    (7, 3): 'insured_num',             # number of insured employees
                    (9, 1): 'registered_address',      # registered address
                    (10, 1): 'business_scope',         # business scope
                }
                for index_1, child_1 in enumerate(
                        div.find_all('div', recursive=False)):
                    if index_1 == 1:
                        for index_1_1, child_1_1 in enumerate(child_1):
                            if index_1_1 == 2:
                                for index_tr, tr in enumerate(
                                        child_1_1.find_all('tr')):
                                    for index_td, td in enumerate(
                                            tr.find_all('td')):
                                        field = field_map.get(
                                            (index_tr, index_td))
                                        if field:
                                            detail_res[field] = td.get_text(
                                            ).strip() or '-'
                        break
                continue

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan':  # key personnel
                people_item = {}
                people_item['c_id'] = obj_id
                people_item['company_name'] = detail_res.get(
                    'company_name', '')
                # name
                people_item['name'] = etc.xpath(
                    '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
                )[0]
                # position
                people_item['position'] = etc.xpath(
                    '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[3]/span/text()'
                )[0]
                bixiao_people.find_one_and_update({'c_id': obj_id},
                                                  {'$set': people_item},
                                                  upsert=True)
                print(people_item)
                for people_vals in people_item:
                    if not people_item[people_vals]:
                        LOG.info(f'key personnel data mismatch: {people_item}, request url: {comp_url}')

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.gudongxinxi':  # shareholder info
                capital_item = {}
                capital_item['c_id'] = obj_id
                capital_item['company_name'] = detail_res.get(
                    'company_name', '')
                # shareholder name
                title = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/a/text()'
                )
                # label
                label = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[2]/table/tbody/tr/td[2]/div/span/text()'
                )
                # shareholding ratio
                has_rates = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[3]/div/div/span/text()'
                )
                # subscribed capital
                subscribed_capital = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr[1]/td[4]/div/span/text()'
                )

                capital_item['title'] = ''.join(title)
                capital_item['label'] = ''.join(label)
                capital_item['has_rates'] = ''.join(has_rates)
                capital_item['subscribed_capital'] = ''.join(
                    subscribed_capital)
                bixiao_shareholder.find_one_and_update({'c_id': obj_id},
                                                       {'$set': capital_item},
                                                       upsert=True)
                print(capital_item, 'C' * 80)

            elif div.get(
                    'tyc-event-ch') == 'CompangyDetail.findNewsCount':  # news coverage
                news_item = {}
                news_item['c_id'] = obj_id
                news_item['company_name'] = detail_res.get('company_name', '')
                # title
                news_item['title'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/text()'
                )[0]
                # article URL
                news_item['info_url'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[1]/a/@href'
                )[0]
                # source
                news_item['source'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[1]/text()'
                )[0]
                # publish date
                news_item['date_doc'] = etc.xpath(
                    '//*[@id="_container_findNewsCount"]/div[1]/div[1]/div[1]/div[3]/span[2]/text()'
                )[0]
                print(news_item)
                # update_one replaces the deprecated Collection.update
                bixiao_news.update_one({'c_id': obj_id}, {'$set': news_item},
                                       upsert=True)
                for news_vals in news_item:
                    if not news_item[news_vals]:
                        LOG.info(f'news data mismatch: {news_item}, request url: {comp_url}')

            elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin':  # product info
                product_item = {}
                product_item['c_id'] = obj_id
                product_item['company_name'] = detail_res.get(
                    'company_name', '')
                # product name
                product_item['name'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[2]/table'
                    '/tbody/tr/td[2]/span/text()')[0]
                # product short name
                product_item['short_name'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[3]'
                    '/span/text()')[0]
                # product category
                product_item['type'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[4]/span'
                    '/text()')[0]
                # domain
                product_item['domain'] = etc.xpath(
                    '//*[@id="_container_product"]/table/tbody/tr[1]/td[5]'
                    '/span/text()')[0]
                print(product_item)
                bixiao_product.find_one_and_update({'c_id': obj_id},
                                                   {'$set': product_item},
                                                   upsert=True)
                for product_vals in product_item:
                    if not product_item[product_vals]:
                        LOG.info(f'product data mismatch: {product_item}, request url: {comp_url}')

            elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin':  # recruitment info
                recruit_item = {}
                recruit_item['c_id'] = obj_id
                recruit_item['company_name'] = detail_res.get(
                    'company_name', '')
                recruit_item['opd_date'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[2]'
                    '/text()')[0]
                recruit_item['position_'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[3]'
                    '/text()')[0]
                recruit_item['month_salary'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[4]'
                    '/text()')[0]
                recruit_item['education'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[5]'
                    '/text()')[0]
                recruit_item['work_experience'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[6]'
                    '/text()')[0]
                recruit_item['address'] = etc.xpath(
                    '//*[@id="_container_baipin"]/table/tbody/tr[1]/td[7]'
                    '/text()')[0]
                print(recruit_item, 'P' * 80)
                bixiao_recruit.find_one_and_update({'c_id': obj_id},
                                                   {'$set': recruit_item},
                                                   upsert=True)
                for recruit_vals in recruit_item:
                    if not recruit_item[recruit_vals]:
                        LOG.info(f'recruitment data mismatch: {recruit_item}, request url: {comp_url}')

            elif div.get('tyc-event-ch'
                         ) == 'CompangyDetail.lishiwangzhanbeian':  # historical ICP filings
                record_item = {}
                record_item['c_id'] = obj_id
                record_item['company_name'] = detail_res.get(
                    'company_name', '')
                record_item['opd_date'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]'
                    '/span/text()')[0]
                record_item['web_name'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]'
                    '/span/text()')[0]
                record_item['index_url'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/'
                    'a/@href')[0]
                record_item['domain_name'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]'
                    '/text()')[0]
                record_item['website_filing'] = etc.xpath(
                    '//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/'
                    'span/text()')[0]
                print(record_item, 'M' * 80)
                bixiao_record_icp.find_one_and_update({'c_id': obj_id},
                                                      {'$set': record_item},
                                                      upsert=True)
                for record_vals in record_item:
                    if not record_item[record_vals]:
                        LOG.info(f'ICP filing data mismatch: {record_item}, request url: {comp_url}')

        print(detail_res, '%' * 80)
        bixiao_business.find_one_and_update({'c_id': obj_id},
                                            {'$set': detail_res},
                                            upsert=True)
        return detail_res
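
The phone/email payloads above arrive as a list literal embedded in a script tag. `ast.literal_eval` (used in the fixes above) accepts only Python literals; a `json.loads` fallback covers JSON-style payloads. A minimal sketch (`parse_script_payload` is a hypothetical helper name, and the payload is assumed to be a list of dicts):

import ast
import json


def parse_script_payload(raw):
    """Parse a script-embedded list/dict literal; return [] on failure."""
    for parser in (ast.literal_eval, json.loads):
        try:
            return parser(raw)
        except (ValueError, SyntaxError):
            continue
    return []

For example, `phone_records = parse_script_payload(origin_phone[0])` replaces the original `eval(origin_phone[0])` without executing arbitrary code.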
Example #4
    def work_by_key(self, key):
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        # page
        for page in range(1, self.MAX_PAGE, 1):
            url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page, parse.quote(key))
            print(url)
            print(cookies_get())
            self.headers['Cookie'] = cookies_get()
            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')
            if not is_ok:
                continue

            with open('company_list.html', 'w', encoding='utf-8') as wf:
                wf.write(search_resp)
            soup = BeautifulSoup(search_resp, 'lxml')
            tags = soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

            def while_req(url):
                sub_is_ok, sub_search_resp = api_get(url=url,
                                                     headers=self.headers,
                                                     data={},
                                                     resptype='text')
                return sub_is_ok, sub_search_resp

            # Manual verification: wait for a human to pass the captcha, then retry
            if len(tags) == 0:
                while 1:
                    if is_ok and len(tags) > 0:
                        break
                    else:
                        LOG.critical('VERIFY ############### %s ###############' %
                                     url)
                        random_sleep(20, 25)
                        self.headers['Cookie'] = cookies_get()
                        is_ok, search_resp = while_req(url)
                        soup = BeautifulSoup(search_resp, 'lxml')
                        tags = soup.find_all(
                            'a',
                            attrs={"tyc-event-ch": "CompanySearch.Company"})

            for tag in tags:
                if not tag or not tag.attrs.get('href'):
                    continue

                res_dict = dict()
                res_dict['tyt_url'] = tag.get('href').strip()
                res_dict['name'] = tag.get_text().strip()

                self.save_list(
                    tag.get('href').strip() + '-' + tag.get_text().strip())
                print(res_dict)
                ret_res.append(res_dict)
                random_sleep(1, 2.5)

        return ret_res
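
Every request site in these examples refreshes `self.headers['Cookie']` via `cookies_get()` and then calls `api_get`. A small wrapper removes that repetition; a minimal sketch, assuming the `cookies_get` and `api_get` helpers used above (`fetch_with_fresh_cookie` is a hypothetical name):

def fetch_with_fresh_cookie(url, headers):
    """Refresh the session cookie, then fetch `url` as text."""
    headers['Cookie'] = cookies_get()
    is_ok, resp = api_get(url=url, headers=headers, data={}, resptype='text')
    return is_ok, resp

Each inline `while_req` closure plus its preceding cookie refresh then reduces to `is_ok, search_resp = fetch_with_fresh_cookie(url, self.headers)`.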
Example #5
    def detail_by_url(self, comp_url: str):
        detail_res = dict()
        if not comp_url:
            return detail_res
        # comp_url is treated as a URL here, matching the retry logic below
        is_ok, search_resp = api_get(url=comp_url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')
        if not is_ok:
            return detail_res
        soup = BeautifulSoup(search_resp, 'lxml')

        # header: company name on the detail page
        title_list = soup.find_all('div', class_="header")
        et2 = etree.HTML(search_resp)
        try:
            company_name = (title_list[0].find_all('h1', class_="name"))[0].get_text()
        except (IndexError, AttributeError):
            # fall back to an absolute XPath when the header block is missing
            name = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[1]/h1/text()')
            company_name = ''.join(name)
        detail_res['company_name'] = company_name

        # Phone numbers: the "more contacts" block is embedded in a script tag
        origin_phone = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[1]/span[3]/script/text()')

        # Email addresses: the "more emails" block is embedded in a script tag
        origin_email = et2.xpath('//*[@id="company_web_top"]/div[2]/div[3]/div[3]/div[1]/div[2]/span[3]/script/text()')

        if origin_phone and origin_email:
            # ast.literal_eval is a safer drop-in for eval() on these list/dict
            # literals (assumes `import ast` at module level)
            phone_records = ast.literal_eval(origin_phone[0])
            year_list = [i.get('showSource') for i in phone_records]
            phone_item_vals = [i.get('phoneNumber') for i in phone_records]
            email_list = ast.literal_eval(origin_email[0])
            for contact in zip(year_list, phone_item_vals, email_list):
                contact_item = {
                    'company_name': detail_res.get('company_name', ''),
                    'report_year': contact[0],
                    'phone': contact[1],
                    'email': contact[-1],
                    'date_time': self.timestamp_to_strftime(time.time()),
                }
                print(contact_item, '@' * 80)
                result = email_phone_insert(contact_item)
                if result.get('status', False):
                    print('insert ok')
                else:
                    print(result.get('msg'))

        # detail: phone, email, company website, address, profile
        detail_div = soup.find_all('div', class_="detail")

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # Manual verification: wait for a human to pass the captcha, then retry
        if not detail_div:
            while 1:
                if is_ok and detail_div:
                    break
                else:
                    LOG.critical('VERIFY ############### %s ###############' % comp_url)
                    random_sleep(20, 25)
                    self.headers['Cookie'] = cookies_get()
                    is_ok, search_resp = while_req(comp_url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    detail_div = soup.find_all('div', class_="detail")

        for div in detail_div[0].find_all('div'):
            if not div:
                continue

            # f0: phone && email
            if div.get('class') == ['f0']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['phone'] = child.get_text().strip() or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['email'] = child.get_text().strip() or '-'
                                break
                    else:
                        break
            # company website && address
            elif div.get('class') == ['f0', 'clearfix']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['company_url'] = child.get_text().strip() or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                for small_index, small_child in enumerate(child.children):
                                    if small_index == 0:
                                        detail_res['address'] = small_child.get_text().strip() or '-'
                                        break
                                break
                    else:
                        break
            # profile
            elif div.get('class') == ['summary']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        resume = big_child.string
                        if resume:
                            resume = resume.strip()
                        detail_res['resume'] = resume or '-'
                        break
                    else:
                        break
            else:
                continue

        # detail-list:
        detail_list_div = soup.find_all('div', class_="detail-list")
        if not detail_list_div:
            return detail_res

        etc = etree.HTML(search_resp)
        for div in detail_list_div[0].find_all('div'):
            if not div:
                continue

            if div.get('tyc-event-ch') == 'CompangyDetail.gongshangxinxin':  # business registration info
                # (row index, cell index) -> detail_res field
                field_map = {
                    (0, 1): 'register_funds',          # registered capital
                    (0, 3): 'paidin_funds',            # paid-in capital
                    (1, 1): 'establish_date',          # date of establishment
                    (1, 3): 'status',                  # operating status
                    (2, 1): 'credit_code',             # unified social credit code
                    (2, 3): 'registration_number',     # business registration number
                    (3, 1): 'identification_number',   # taxpayer identification number
                    (3, 3): 'organization_code',       # organization code
                    (4, 1): 'company_type',            # company type
                    (4, 3): 'industry',                # industry
                    (6, 1): 'business_term',           # business term
                    (6, 3): 'taxpayer_qualification',  # taxpayer qualification
                    (7, 1): 'personnel_size',          # staff size
                    (7, 3): 'insured_num',             # number of insured employees
                    (9, 1): 'registered_address',      # registered address
                    (10, 1): 'business_scope',         # business scope
                }
                for index_1, child_1 in enumerate(div.find_all('div', recursive=False)):
                    if index_1 == 1:
                        for index_1_1, child_1_1 in enumerate(child_1):
                            if index_1_1 == 2:
                                for index_tr, tr in enumerate(child_1_1.find_all('tr')):
                                    for index_td, td in enumerate(tr.find_all('td')):
                                        field = field_map.get((index_tr, index_td))
                                        if field:
                                            detail_res[field] = td.get_text().strip() or '-'
                        break
                continue

            elif div.get('tyc-event-ch') == 'CompangyDetail.zhuyaorenyuan':  # key personnel
                people_item = {}
                people_item['company_name'] = detail_res.get('company_name', '')
                # name
                name = etc.xpath('//*[@id="_container_staff"]/div/table/tbody/tr/td[2]/table/tbody/tr/td[2]/a/text()')
                # position
                position = etc.xpath('//*[@id="_container_staff"]/div/table/tbody/tr/td[3]/span/text()')
                # detail URL
                doc_url = etc.xpath(
                    '//*[@id="_container_staff"]/div/table/tbody/tr[1]/td[2]/table/tbody/tr/td[3]/a/@href')

                for people in zip(name, position, doc_url):
                    people_item['name'] = people[0]
                    people_item['position'] = people[1]
                    people_item['doc_url'] = people[2]
                    people_item['created_time'] = self.timestamp_to_strftime(time.time())

                    result = people_insert(people_item)
                    if result.get('status', False):
                        print(result)
                    else:
                        LOG.debug(f'key personnel insert failed: {result.get("msg")}...')

                    # upsert keyed on the person's own detail URL
                    bixiao_people.find_one_and_update({'doc_url': people_item['doc_url']},
                                                      {'$set': people_item}, upsert=True)
                    print(people_item)


            elif div.get('tyc-event-ch') == 'CompangyDetail.gudongxinxi':  # shareholder info
                capital_item = {}
                capital_item['company_name'] = detail_res.get('company_name', '')
                # shareholder name
                title = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[2]/a/text()')
                # label
                label = etc.xpath(
                    '//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[2]/div/span/text()')
                # shareholding ratio
                has_rates = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[3]/div/div/span/text()')
                # subscribed capital
                subscribed_capital = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[4]/div/span/text()')
                # detail URL
                doc_url = etc.xpath('//*[@id="_container_holder"]/table/tbody/tr/td[2]/table/tbody/tr/td[3]/a/@href')

                for capital in zip(title, label, has_rates, subscribed_capital, doc_url):
                    capital_item['title'] = ''.join(capital[0])
                    capital_item['label'] = ''.join(capital[1])
                    capital_item['has_rates'] = ''.join(capital[2])
                    capital_item['subscribed_capital'] = ''.join(capital[3])
                    capital_item['doc_url'] = capital[4]
                    capital_item['created_time'] = self.timestamp_to_strftime(time.time())
                    # upsert keyed on the shareholder's own detail URL
                    bixiao_shareholder.find_one_and_update({'doc_url': capital_item['doc_url']},
                                                           {'$set': capital_item}, upsert=True)
                    print(capital_item, 'C' * 80)


            elif div.get('tyc-event-ch') == 'CompangyDetail.findNewsCount':  # news coverage
                news_item = {}
                news_item['company_name'] = detail_res.get('company_name', '')
                # title
                title = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[1]/a/text()')
                # article URL
                info_url = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[1]/a/@href')
                # source
                source = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[3]/span[1]/text()')
                # publish date
                date_doc = etc.xpath('//*[@id="_container_findNewsCount"]/div[1]/div[1]/div/div[3]/span[2]/text()')
                for news_datas in zip(title, info_url, source, date_doc):
                    news_item['title'] = news_datas[0]
                    news_item['info_url'] = news_datas[1]
                    news_item['source'] = news_datas[2]
                    news_item['date_doc'] = news_datas[3]
                    news_item['content'] = self.request_doing(url=news_datas[1], headers=self.headers, params={})
                    news_item['created_time'] = self.timestamp_to_strftime(time.time())

                    print(news_item)
                    # update_one replaces the deprecated Collection.update; upsert keyed on the article URL
                    bixiao_news.update_one({'info_url': news_item['info_url']}, {'$set': news_item}, upsert=True)


            elif div.get('tyc-event-ch') == 'CompangyDetail.chanpin':  # product info
                product_item = {}
                product_item['company_name'] = detail_res.get('company_name', '')
                # product name
                name = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[2]/table'
                                 '/tbody/tr/td[2]/span/text()')
                # product short name
                short_name = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[3]'
                                       '/span/text()')
                # product category (renamed from `type` to avoid shadowing the builtin)
                product_type = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[4]/span'
                                         '/text()')
                # domain
                domain = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[5]'
                                   '/span/text()')
                # detail URL
                doc_url = etc.xpath('//*[@id="_container_product"]/table/tbody/tr/td[6]/a/@href')

                for product in zip(name, short_name, product_type, domain, doc_url):
                    product_item['name'] = product[0]
                    product_item['short_name'] = product[1]
                    product_item['type'] = product[2]
                    product_item['domain'] = product[3]
                    product_item['doc_url'] = product[4]
                    product_item['doc_info'] = self.request_doing(url=product[4], headers=self.headers, params={})
                    product_item['created_time'] = self.timestamp_to_strftime(time.time())

                    print(product_item)
                    # upsert keyed on the product's own detail URL
                    bixiao_product.find_one_and_update({'doc_url': product_item['doc_url']},
                                                       {'$set': product_item}, upsert=True)


            elif div.get('tyc-event-ch') == 'CompangyDetail.zhaopin':  # recruitment info
                recruit_item = {}
                recruit_item['company_name'] = detail_res.get('company_name', '')
                opd_date = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[2]'
                                     '/text()')
                position_ = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[3]'
                                      '/text()')
                month_salary = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[4]'
                                         '/text()')
                education = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[5]'
                                      '/text()')
                work_experience = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[6]'
                                            '/text()')
                address = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[7]'
                                    '/text()')
                opd_url = etc.xpath('//*[@id="_container_baipin"]/table/tbody/tr/td[8]/a/@href')

                for recruit in zip(opd_date, position_, month_salary, education, work_experience, address, opd_url):
                    recruit_item['opd_date'] = recruit[0]
                    recruit_item['position_'] = recruit[1]
                    recruit_item['month_salary'] = recruit[2]
                    recruit_item['education'] = recruit[3]
                    recruit_item['work_experience'] = recruit[4]
                    recruit_item['address'] = recruit[5]
                    recruit_item['opd_url'] = recruit[6]
                    recruit_item['created_time'] = self.timestamp_to_strftime(time.time())

                    print(recruit_item, 'P' * 80)
                    # upsert keyed on the posting's own URL
                    bixiao_recruit.find_one_and_update({'opd_url': recruit_item['opd_url']},
                                                       {'$set': recruit_item}, upsert=True)


            elif div.get('tyc-event-ch') == 'CompangyDetail.lishiwangzhanbeian':  # historical ICP filings
                record_item = {}
                record_item['company_name'] = detail_res.get('company_name', '')
                # review date
                opd_date = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[2]'
                                     '/span/text()')
                # website name
                web_name = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[3]'
                                     '/span/text()')
                # website homepage
                index_url = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[4]/div/'
                                      'a/@href')
                # domain name
                domain_name = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[5]'
                                        '/text()')
                # filing / license number
                website_filing = etc.xpath('//*[@id="_container_pastIcpList"]/table/tbody/tr/td[6]/'
                                           'span/text()')

                for record in zip(opd_date, web_name, index_url, domain_name, website_filing):
                    record_item['opd_date'] = record[0]
                    record_item['web_name'] = record[1]
                    record_item['index_url'] = record[2]
                    record_item['domain_name'] = record[3]
                    record_item['website_filing'] = record[4]
                    record_item['created_time'] = self.timestamp_to_strftime(time.time())

                    res = record_icp_insert(record_item)
                    if res.get('status', False):
                        print(res)
                    else:
                        LOG.debug(f'ICP filing insert failed: {res.get("msg")}...')


            elif div.get('tyc-event-ch') == 'CompangyDetail.rongzilishi':     # financing history
                financing_item = dict()
                financing_item['company_name'] = detail_res.get('company_name', '')
                # disclosure date
                opd_date = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[2]/text()')
                # transaction amount
                change_money = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[3]/text()')
                # financing round
                financing_round = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[4]/div[1]/text()')
                # valuation
                valuation = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[5]/text()')
                # stake
                proportion = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[6]/text()')
                # investors
                investor = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[7]/div/a/text()')
                # news source
                news_source = etc.xpath('//*[@id="_container_rongzi"]/table/tbody/tr/td[8]/div/text()')

                for financing in zip(opd_date, change_money, financing_round, valuation,
                                     proportion, investor, news_source):
                    financing_item['opd_date'] = financing[0]
                    financing_item['change_money'] = financing[1]
                    financing_item['financing_round'] = financing[2]
                    financing_item['valuation'] = financing[3]
                    financing_item['proportion'] = financing[4]
                    financing_item['investor'] = financing[5]
                    financing_item['news_source'] = financing[6]
                    financing_item['created_time'] = self.timestamp_to_strftime(time.time())

                    print(financing_item, 'F'*80)
                    res = financing_insert(financing_item)
                    if res.get('status', False):
                        print(res)
                    else:
                        LOG.debug(f'financing history insert failed: {res.get("msg")}...')

            elif div.get('tyc-event-ch') == 'CompangyDetail.nianbao':     # annual reports
                reports_item = dict()
                reports_item['company_name'] = detail_res.get('company_name', '')
                # report name
                reports = etc.xpath('//*[@id="web-content"]/div/div/div[5]/div[1]/div/div[2]/div[1]/div[15]/div[2]'
                                    '/div/table/tbody/tr/td[2]/text()')
                # detail URL
                operation = etc.xpath('//*[@id="web-content"]/div/div/div[5]/div[1]/div/div[2]/div[1]/div[15]/div[2]'
                                      '/div/table/tbody/tr/td[3]/a/@href')
                for annual in zip(reports, operation):
                    reports_item['reports'] = annual[0]
                    reports_item['operation'] = annual[1]
                    # fetch this report's own URL, not the whole `operation` list
                    reports_item['reports_info'] = self.request_doing(url=annual[1], headers=self.headers, params={})
                    reports_item['created_time'] = self.timestamp_to_strftime(time.time())

                    print(reports_item, '?'*80)
                    res = reports_insert(reports_item)
                    if res.get('status', False):
                        print(res)
                    else:
                        LOG.debug(f'annual report insert failed: {res.get("msg")}...')


        print(detail_res, '%' * 80)
        bixiao_business.find_one_and_update({'company_name': detail_res.get('company_name', '')},
                                            {'$set': detail_res}, upsert=True)
        return detail_res
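
Each `elif` branch in Example #5 repeats one pattern: collect several column XPaths, zip them row-wise, and build a dict per row. A minimal sketch of a shared helper under that assumption (`rows_from_columns` is a hypothetical name; `tree` is an lxml element such as `etc`):

def rows_from_columns(tree, columns, base=None):
    """Build one dict per table row from parallel column XPaths.

    `columns` maps field name -> XPath; rows are truncated to the
    shortest column, mirroring the zip() calls above.
    """
    values = {field: tree.xpath(xp) for field, xp in columns.items()}
    rows = []
    for cells in zip(*values.values()):
        row = dict(base or {})          # shared fields, e.g. company_name
        row.update(zip(values.keys(), cells))
        rows.append(row)
    return rows

The shareholder branch, for instance, would become a `columns` dict of the five `_container_holder` XPaths plus a `base` of `{'company_name': detail_res.get('company_name', '')}`, with one upsert per returned row.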
Example #6
    def get_pagination(self,
                       key,
                       _type='default',
                       city_id=None,
                       sub_city_id=None,
                       cityes=None,
                       sub_city_info=None):
        min_page = self.MIN_PAGE
        max_page = self.MAX_PAGE

        # ceiling division: number of PAGINATION-sized windows in the span
        max_range = -(-(max_page - min_page) // PAGINATION)

        if API_MODE not in ['tyc', 'pro']:
            return min_page, max_page, max_page, max_range

        if API_MODE == 'tyc' and _type == 'city':
            city_info = cityes.get(city_id)
            url = '%s?key=%s&base=%s' % (TYC_SEARCH_API, parse.quote(key),
                                         city_info.get('name'))
        elif API_MODE == 'tyc' and _type == 'sub_city':
            city_info = cityes.get(city_id)
            if city_id in ZXS_CITY_IDS:
                url = '%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \
                      % (TYC_SEARCH_API, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name')))
            else:
                url = '%s?key=%s&base=%s' % (TYC_SEARCH_API, parse.quote(key),
                                             sub_city_info.get('name'))
        elif API_MODE == 'tyc':
            url = '%s?key=%s' % (TYC_SEARCH_API, parse.quote(key))
        elif API_MODE == 'pro' and _type == 'city':
            city_info = cityes.get(city_id)
            url = '%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, parse.quote(key),
                                         city_info.get('name'))
        elif API_MODE == 'pro' and _type == 'sub_city':
            city_info = cityes.get(city_id)
            if city_id in ZXS_CITY_IDS:
                url = '%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \
                      % (TYC_PRO_SEARCH_API, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name')))
            else:
                url = '%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API,
                                             parse.quote(key),
                                             sub_city_info.get('name'))
        elif API_MODE == 'pro':
            url = '%s?key=%s' % (TYC_PRO_SEARCH_API, parse.quote(key))

        self.headers['Referer'] = url
        is_ok, search_resp = api_get(url=url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')

        soup = BeautifulSoup(search_resp, 'lxml')
        search_pagination = soup.find_all('div', class_='search-pagination')

        # only one page of results
        if is_ok and not search_pagination:
            return 0, 1, 1, 1

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # manual-verification fallback: wait for a human to clear the captcha
        if len(search_pagination) == 0 or not is_ok:
            while 1:
                if is_ok and len(search_pagination) > 0:
                    break
                else:
                    LOG.critical('verification ############### %s ###############' % url)
                    random_sleep(20, 25)
                    is_ok, search_resp = while_req(url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    search_pagination = soup.find_all(
                        'div', class_='search-pagination')

        links = search_pagination[0].find_all('a')
        for index_a, a in enumerate(links):
            if index_a == len(links) - 2:
                max_page = a.string.strip()
                if '...' in max_page:
                    max_page = max_page.split('...')[1]
                # always cast to int: without this, a plain label like '178'
                # left max_page as a str and the comparisons below raised
                max_page = int(max_page)
                break

        max_pagination = max_page
        if MIN_PAGE:
            min_page = int(MIN_PAGE)
        if MAX_PAGE:
            max_page = int(MAX_PAGE) if (int(MAX_PAGE) < int(max_pagination)) \
                else int(max_pagination)

        if min_page == max_page:
            max_range = 1
        elif min_page > max_page:
            LOG.critical('Invalid page range: min[%s] max[%s]' %
                         (min_page, max_page))
            sys.exit()
        else:
            # ceiling division, as above
            max_range = -(-(max_page - min_page) // PAGINATION)

        return min_page, max_page, max_pagination, max_range
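
max_range is simply the ceiling of (max_page - min_page) / PAGINATION: how many fixed-size windows cover the page span. A sketch of how a caller might walk those windows; the PAGINATION value here is an assumption, since the project reads the real constant from its config:

    PAGINATION = 5  # assumed value; the real constant comes from deploy config

    def page_windows(min_page, max_page, size=PAGINATION):
        # ceiling division, the same formula get_pagination uses for max_range
        max_range = -(-(max_page - min_page) // size)
        for i in range(max_range):
            start = min_page + i * size
            yield start, min(start + size, max_page)

    print(list(page_windows(0, 23)))
    # [(0, 5), (5, 10), (10, 15), (15, 20), (20, 23)]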
Example #7
    def detail_pro_by_url(self, comp_url: str):
        detail_res = dict()
        if not comp_url:
            return detail_res

        self.headers['Referer'] = comp_url
        is_ok, search_resp = api_get(url=comp_url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')
        if not is_ok:
            print('X-' * 100)
            print(comp_url)
            return detail_res

        soup = BeautifulSoup(search_resp, 'lxml')

        # detail: phone, email, company website, address, profile
        detail_div = soup.find_all('div', class_="ie9Style")

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # manual-verification fallback: wait for a human to clear the captcha
        if not detail_div:
            while 1:
                if is_ok and detail_div:
                    break
                else:
                    LOG.critical('verification ############### %s ###############' %
                                 comp_url)
                    random_sleep(20, 25)
                    is_ok, search_resp = while_req(comp_url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    detail_div = soup.find_all('div', class_="ie9Style")

        # 0: company thumbnail, 1: basic info, 2: downloads
        for index, div in enumerate(detail_div[1].find_all('div',
                                                           recursive=False)):
            if not div:
                continue

            # phone && email
            if index == 1:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for sub_index, child in enumerate(big_child.children):
                            if sub_index == 1:
                                detail_res['phone'] = child.get_text().strip(
                                ) or '-'
                    elif big_index == 1:
                        for sub_index, child in enumerate(big_child.children):
                            if sub_index == 1:
                                detail_res['email'] = child.get_text().strip(
                                ) or '-'
            # company website && address
            elif index == 2:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for sub_index, child in enumerate(big_child.children):
                            if sub_index == 1:
                                detail_res['company_url'] = child.get_text(
                                ).strip() or '-'
                    elif big_index == 1:
                        for sub_index, child in enumerate(big_child.children):
                            if sub_index == 1:
                                detail_res['address'] = child.get_text().strip(
                                ) or '-'
                                break
            # company profile
            elif index == 3:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for sub_index, sub_child in enumerate(big_child):
                            if sub_index == 1:
                                resume = sub_child.string
                                if resume:
                                    resume = resume.strip()
                                detail_res['resume'] = resume or '-'
                                break
                    break
            else:
                continue

        # detail-list: credit code, company type, industry, business term, paid-in capital, business scope
        detail_list_div = soup.find_all('div', class_='base0910')
        if not detail_list_div:
            return detail_res

        for index_tr, tr in enumerate(detail_list_div[0].find_all('tr')):
            if not tr:
                continue

            if index_tr == 1:
                for index_td, td in enumerate(
                        tr.find_all('td', recursive=False)):
                    if index_td == 1:  # credit code
                        detail_res['credit_code'] = td.get_text().strip(
                        ) or '-'
                    elif index_td == 3:  # company type
                        detail_res['company_type'] = td.get_text().strip(
                        ) or '-'
            elif index_tr == 2:
                for index_td, td in enumerate(
                        tr.find_all('td', recursive=False)):
                    if index_td == 3:  # industry
                        detail_res['industry'] = td.get_text().strip() or '-'
            elif index_tr == 3:
                for index_td, td in enumerate(
                        tr.find_all('td', recursive=False)):
                    if index_td == 1:  # business term
                        detail_res['business_term'] = td.get_text().strip(
                        ) or '-'
            elif index_tr == 4:
                for index_td, td in enumerate(
                        tr.find_all('td', recursive=False)):
                    if index_td == 3:  # paid-in capital
                        detail_res['paidin_funds'] = td.get_text().strip(
                        ) or '-'
            elif index_tr == 8:
                for index_td, td in enumerate(
                        tr.find_all('td', recursive=False)):
                    if index_td == 1:  # business scope
                        detail_res['business_scope'] = td.get_text().strip(
                        ) or '-'

        # detail-list: registered capital, registration date, operating status
        detail_list_div_1 = soup.find_all('div', class_='baseInfo_model2017')
        if not detail_list_div_1:  # was checking detail_list_div, the wrong variable
            return detail_res

        for index_table, table in enumerate(
                detail_list_div_1[0].find_all('table')):
            if not table:
                continue

            if index_table == 1:
                for index_tr, tr in enumerate(table.find_all('tr')):
                    if index_tr == 1:
                        for index_td, td in enumerate(
                                tr.find_all('td', recursive=False)):
                            if index_td == 1:
                                for index_td_span, td_span in enumerate(
                                        td.find_all('span')):
                                    if index_td_span == 1:
                                        detail_res[
                                            'register_funds'] = td_span.get_text(
                                            ).strip() or '-'
                    elif index_tr == 2:
                        for index_td, td in enumerate(
                                tr.find_all('td', recursive=False)):
                            if index_td == 0:
                                for index_td_span, td_span in enumerate(
                                        td.find_all('span')):
                                    if index_td_span == 1:
                                        detail_res[
                                            'establish_date'] = td_span.get_text(
                                            ).strip() or '-'
                    elif index_tr == 3:
                        for index_td, td in enumerate(
                                tr.find_all('td', recursive=False)):
                            if index_td == 0:
                                for index_td_div, td_div in enumerate(
                                        td.find_all('div', recursive=False)):
                                    if index_td_div == 0:
                                        for index_td_div_span, td_div_span in enumerate(
                                                td_div.find_all(
                                                    'span', recursive=False)):
                                            if index_td_div_span == 1:
                                                detail_res[
                                                    'status'] = td_div_span.get_text(
                                                    ).strip() or '-'

        return detail_res
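
The method above reaches every field through hard-coded positional indexes (table 1, row 2, cell 0, span 1, ...), so a layout change on the page either shifts fields silently or raises IndexError deep in the nesting. A hedged sketch of the same idea factored through one guard helper; the HTML fragment and field name are illustrative, not taken from the real page:

    from bs4 import BeautifulSoup

    def nth(elements, index, default=None):
        # positional lookup that degrades to a default instead of IndexError
        try:
            return elements[index]
        except IndexError:
            return default

    html = '<table><tr><td>注册资本</td><td><span>-</span><span>100万</span></td></tr></table>'
    soup = BeautifulSoup(html, 'lxml')
    row = nth(soup.find_all('tr'), 0)
    cell = nth(row.find_all('td', recursive=False), 1) if row else None
    span = nth(cell.find_all('span'), 1) if cell else None
    register_funds = span.get_text().strip() if span else '-'
    print(register_funds)  # 100万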
Example #8
    def detail_by_url(self, comp_url: str):
        detail_res = dict()
        if not comp_url:
            return detail_res

        self.headers['Referer'] = comp_url
        is_ok, search_resp = api_get(url=comp_url,
                                     headers=self.headers,
                                     data={},
                                     resptype='text')
        if not is_ok:
            return detail_res

        soup = BeautifulSoup(search_resp, 'lxml')

        # detail: phone, email, company website, address, profile
        detail_div = soup.find_all('div', class_="detail")

        def while_req(url):
            sub_is_ok, sub_search_resp = api_get(url=url,
                                                 headers=self.headers,
                                                 data={},
                                                 resptype='text')
            return sub_is_ok, sub_search_resp

        # manual-verification fallback: wait for a human to clear the captcha
        if not detail_div:
            while 1:
                if is_ok and detail_div:
                    break
                else:
                    LOG.critical('verification ############### %s ###############' %
                                 comp_url)
                    random_sleep(20, 25)
                    is_ok, search_resp = while_req(comp_url)
                    soup = BeautifulSoup(search_resp, 'lxml')
                    detail_div = soup.find_all('div', class_="detail")

        for div in detail_div[0].find_all('div'):
            if not div:
                continue

            # f0: phone && email
            if div.get('class') == ['f0']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['phone'] = child.get_text().strip(
                                ) or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['email'] = child.get_text().strip(
                                ) or '-'
                                break
                    else:
                        break
            # company website && address
            elif div.get('class') == ['f0', 'clearfix']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                detail_res['company_url'] = child.get_text(
                                ).strip() or '-'
                                break
                    elif big_index == 1:
                        for index, child in enumerate(big_child.children):
                            if index == 1:
                                for small_index, small_child in enumerate(
                                        child.children):
                                    if small_index == 0:
                                        detail_res[
                                            'address'] = small_child.get_text(
                                            ).strip() or '-'
                                        break
                                break
                    else:
                        break
            # company profile
            elif div.get('class') == ['summary']:
                for big_index, big_child in enumerate(div):
                    if big_index == 0:
                        resume = big_child.string
                        if resume:
                            resume = resume.strip()
                        detail_res['resume'] = resume or '-'
                        break
                    else:
                        break
            else:
                continue

        # detail-list:
        detail_list_div = soup.find_all('div', class_="detail-list")
        if not detail_list_div:
            return detail_res

        for div in detail_list_div[0].find_all('div'):
            if not div:
                continue

            if div.get('tyc-event-ch') == 'CompangyDetail.gongshangxinxin':
                for index_1, child_1 in enumerate(
                        div.find_all('div', recursive=False)):
                    if index_1 == 1:
                        for index_1_1, child_1_1 in enumerate(child_1):
                            if index_1_1 == 2:
                                for index_tr, tr in enumerate(
                                        child_1_1.find_all('tr')):
                                    if index_tr == 0:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # registered capital
                                                detail_res[
                                                    'register_funds'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # paid-in capital
                                                detail_res[
                                                    'paidin_funds'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 1:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # establishment date
                                                detail_res[
                                                    'establish_date'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # operating status
                                                detail_res[
                                                    'status'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 2:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # credit code
                                                detail_res[
                                                    'credit_code'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 4:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # company type
                                                detail_res[
                                                    'company_type'] = td.get_text(
                                                    ).strip() or '-'
                                            elif index_td == 3:  # industry
                                                detail_res[
                                                    'industry'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 6:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # business term
                                                detail_res[
                                                    'business_term'] = td.get_text(
                                                    ).strip() or '-'
                                    elif index_tr == 10:
                                        for index_td, td in enumerate(
                                                tr.find_all('td')):
                                            if index_td == 1:  # business scope
                                                detail_res[
                                                    'business_scope'] = td.get_text(
                                                    ).strip() or '-'

                        break
                break
        return detail_res
Example #9
    def work_by_key(self,
                    key,
                    min_page,
                    max_page,
                    type='default',
                    queue=None,
                    cid=None,
                    sub_cid=None,
                    city_info=None,
                    sub_city_info=None):
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        # page
        for page in range(min_page, max_page + 1, 1):
            if API_MODE == 'tyc' and type == 'default':
                url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page,
                                         parse.quote(key))
            elif API_MODE == 'tyc' and type == 'city':
                url = '%s/p%s?key=%s&base=%s' % (TYC_SEARCH_API, page,
                                                 parse.quote(key),
                                                 city_info.get('name'))
            elif API_MODE == 'tyc' and type == 'sub_city':
                if cid in ZXS_CITY_IDS:
                    url = '%s/p%s?key=%s&base=%s&areaCode=%s' % (
                        TYC_SEARCH_API, page, parse.quote(key),
                        sub_city_info.get('name'), sub_city_info.get('code'))
                else:
                    url = '%s/p%s?key=%s&base=%s' % (TYC_SEARCH_API, page,
                                                     parse.quote(key),
                                                     sub_city_info.get('name'))
            elif API_MODE == 'pro' and type == 'default':
                url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, page,
                                         parse.quote(key))
            elif API_MODE == 'pro' and type == 'city':
                url = '%s/p%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, page,
                                                 parse.quote(key),
                                                 city_info.get('name'))
            elif API_MODE == 'pro' and type == 'sub_city':
                if cid in ZXS_CITY_IDS:
                    url = '%s/p%s?key=%s&base=%s&areaCode=%s&baseArea=%s' \
                          % (TYC_PRO_SEARCH_API, page, parse.quote(key), city_info.get('name'), sub_city_info.get('code'), parse.quote(sub_city_info.get('name')))
                else:
                    url = '%s/p%s?key=%s&base=%s' % (TYC_PRO_SEARCH_API, page,
                                                     parse.quote(key),
                                                     sub_city_info.get('name'))
            else:
                LOG.critical('====== API_MODE is not in [tyc, pro] ======')
                sys.exit(1)
            LOG.info('%s[%s]%s' % (key, API_MODE, url))

            self.headers['Referer'] = url
            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')
            if not is_ok:
                continue
            if self.check_no(url, _type='page'):
                continue

            soup = BeautifulSoup(search_resp, 'lxml')
            tags = soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

            def while_req(url):
                sub_is_ok, sub_search_resp = api_get(url=url,
                                                     headers=self.headers,
                                                     data={},
                                                     resptype='text')
                return sub_is_ok, sub_search_resp

            # manual-verification fallback: wait for a human to clear the captcha
            if len(tags) == 0:
                while 1:
                    if is_ok and len(tags) > 0:
                        break
                    else:
                        LOG.critical('verification ############### %s ###############' %
                                     url)
                        random_sleep(20, 25)
                        is_ok, search_resp = while_req(url)
                        soup = BeautifulSoup(search_resp, 'lxml')
                        tags = soup.find_all(
                            'a',
                            attrs={"tyc-event-ch": "CompanySearch.Company"})

            for tag in tags:
                if not tag or not tag.attrs.get('href'):
                    continue

                res_dict = dict()
                if API_MODE == 'tyc':
                    tyc_url = tag.get('href').strip()
                elif API_MODE == 'pro':
                    tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API,
                                                   tag.get('href').strip())
                else:
                    tyc_url = ''
                res_dict['tyc_url'] = tyc_url
                res_dict['name'] = tag.get_text().strip()
                res_dict['key'] = key
                res_dict['is_send_email'] = False
                res_dict['city'] = city_info.get(
                    'full_name') if city_info else '-'
                res_dict['sub_city'] = sub_city_info.get(
                    'full_name') if sub_city_info else '-'
                detail_res = dict()  # dict, not list; res_dict.update() below needs a mapping
                if API_MODE == 'tyc':
                    detail_res = self.detail_by_url(res_dict.get('tyc_url'))
                elif API_MODE == 'pro':
                    detail_res = self.detail_pro_by_url(
                        res_dict.get('tyc_url'))
                res_dict.update(detail_res)
                print('%s[%s] %s' %
                      (res_dict['name'], str(True if res_dict else False),
                       res_dict['tyc_url']))
                ret_res.append(res_dict)
                if queue:
                    queue.put(res_dict)
                random_sleep(3.5, 4.5)
                if IS_TEST_BREAK:
                    break
            if IS_TEST_BREAK:
                break
        return ret_res
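
When a queue is passed in, each res_dict is handed off as soon as it is scraped, so persistence can run in a separate process while this loop keeps crawling. A minimal producer/consumer sketch of that handoff; the writer body is a stand-in for the project's *_insert helpers:

    from multiprocessing import Process, Queue

    def writer(queue):
        # drain scraped records until a None sentinel arrives
        while True:
            item = queue.get()
            if item is None:
                break
            print('persist:', item.get('name'))  # stand-in for a DB insert

    if __name__ == '__main__':  # required under spawn-based start methods
        queue = Queue()
        consumer = Process(target=writer, args=(queue,))
        consumer.start()
        queue.put({'name': 'demo company', 'tyc_url': 'https://example.com', 'key': 'demo'})
        queue.put(None)  # sentinel: no more work
        consumer.join()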
Example #10
File: base.py  Project: XieFengCheng/xfc
------------------------------------------------
"""
import sys

from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from deploy.config import DB_LINK
from deploy.utils.logger import logger as LOG

DBSession = None

if not DB_LINK:
    LOG.critical('DB configuration is unavailable')
    sys.exit(1)

db_link = DB_LINK

ModelBase = declarative_base()


def init_database_engine():
    return create_engine(db_link, echo=False, pool_recycle=800, pool_size=100)


def get_session():
    global DBSession
    if not DBSession:
        dbengine_databus = init_database_engine()
        # assumed completion of the truncated snippet: bind a sessionmaker to
        # the engine, cache it at module level, and hand back a new session
        DBSession = sessionmaker(bind=dbengine_databus)
    return DBSession()
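
With the factory completed as sketched above, each unit of work can take a short-lived session. The query and the Company model here are hypothetical, but the commit/rollback/close discipline is the standard SQLAlchemy pattern:

    session = get_session()
    try:
        # session.query(Company).filter_by(name='demo').first()  # hypothetical model
        session.commit()
    except Exception as exc:
        session.rollback()
        LOG.critical('DB operation failed: %s' % exc)
    finally:
        session.close()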
Example #11
    def work_by_key(self, key, min_page=0, max_page=5, queue=None):
        ret_res = list()
        if not key:
            LOG.error("【%s】key is null, no work." % RUN_MODE)
            return ret_res

        if not min_page:
            min_page = self.MIN_PAGE
        if not max_page:
            max_page = self.MAX_PAGE

        LOG.info('%s[%s ~ %s]' % (key, min_page, max_page))
        # page
        for page in range(min_page, max_page, 1):
            if API_MODE == 'tyc':
                url = '%s/p%s?key=%s' % (TYC_SEARCH_API, page,
                                         parse.quote(key))
            elif API_MODE == 'pro':
                url = '%s/p%s?key=%s' % (TYC_PRO_SEARCH_API, page,
                                         parse.quote(key))
            else:
                LOG.critical('====== API_MODE is not in [tyc, pro] ======')
                sys.exit(1)
            LOG.info('%s[%s]%s' % (key, API_MODE, url))

            is_ok, search_resp = api_get(url=url,
                                         headers=self.headers,
                                         data={},
                                         resptype='text')

            if not is_ok:
                continue

            soup = BeautifulSoup(search_resp, 'lxml')
            tags = soup.find_all(
                'a', attrs={"tyc-event-ch": "CompanySearch.Company"})

            def while_req(url):
                sub_is_ok, sub_search_resp = api_get(url=url,
                                                     headers=self.headers,
                                                     data={},
                                                     resptype='text')
                return sub_is_ok, sub_search_resp

            # manual-verification fallback: wait for a human to clear the captcha
            if len(tags) == 0:
                while 1:
                    if is_ok and len(tags) > 0:
                        break
                    else:
                        LOG.critical('verification ############### %s ###############' %
                                     url)
                        random_sleep(20, 25)
                        is_ok, search_resp = while_req(url)
                        soup = BeautifulSoup(search_resp, 'lxml')
                        tags = soup.find_all(
                            'a',
                            attrs={"tyc-event-ch": "CompanySearch.Company"})

            for tag in tags:
                if not tag or not tag.attrs.get('href'):
                    continue

                res_dict = dict()
                if API_MODE == 'tyc':
                    tyc_url = tag.get('href').strip()
                elif API_MODE == 'pro':
                    tyc_url = '%s%s/background' % (TYC_PRO_DETAIL_API,
                                                   tag.get('href').strip())
                else:
                    tyc_url = ''
                res_dict['tyc_url'] = tyc_url
                res_dict['name'] = tag.get_text().strip()
                res_dict['key'] = key
                detail_res = dict()  # dict, not list; res_dict.update() below needs a mapping
                if API_MODE == 'tyc':
                    detail_res = self.detail_by_url(res_dict.get('tyc_url'))
                elif API_MODE == 'pro':
                    detail_res = self.detail_pro_by_url(
                        res_dict.get('tyc_url'))
                res_dict.update(detail_res)
                print('%s[%s] %s' %
                      (res_dict['name'], str(True if res_dict else False),
                       res_dict['tyc_url']))
                ret_res.append(res_dict)
                if queue:
                    queue.put(res_dict)
                random_sleep(3.2, 4.5)
                if IS_TEST_BREAK:
                    break
            if IS_TEST_BREAK:
                break
        return ret_res
Example #12
    def _die(self, message: str = None):
        if message:
            LOG.critical(message)
        os._exit(0)
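
os._exit terminates the interpreter immediately: unlike sys.exit it raises no SystemExit, so finally blocks and atexit hooks never run. A tiny demonstration (standard library only):

    import atexit
    import os

    atexit.register(lambda: print('cleanup ran'))
    os._exit(0)  # process ends here; 'cleanup ran' is never printed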