Пример #1
0
 def __init__(self):
     """Create the MongoDB and Redis helpers and start with an empty item.

     NOTE(review): the MongoDB arguments look like (address, database,
     collection) -- confirm against the MongoDB wrapper.
     """
     mongo_target = ('172.16.74.249:27017', 'db_reptile_company',
                     'company_name')
     self.db = MongoDB(*mongo_target)
     # Redis connection settings come from the project configuration names.
     redis_options = {
         'host': RedisHost,
         'port': RedisPort,
         'password': RedisPassword,
         'db': RedisDB,
     }
     self.redis = REDIS(**redis_options)
     # Starts out as an empty dict.
     self.item = {}
import re
from concurrent.futures.thread import ThreadPoolExecutor
from urllib import parse

import pymysql

from Func.Parse import FETCH

from Func.Redis import REDIS
from Func.conf import *
import json

# Module-level Redis client; note it uses the `qcc1` database setting.
redis = REDIS(
    host=RedisHost,
    port=RedisPort,
    password=RedisPassword,
    db=qcc1,
)
# Module-level FETCH instance.
s = FETCH()


class get_json(object):
    """Hold two item dicts plus a table of status codes and their messages."""

    def __init__(self):
        """Start with empty item dicts and set up the status-code attributes."""
        self.item, self.item1 = {}, {}
        # Status code -> message shown for that outcome.
        self.dic = {
            '200': '数据抓取成功',  # data scraped successfully
            '201': '未找到匹配的公司名',  # no matching company name found
            '202': '网站信息抓取失败',  # fetching the site information failed
            '203': '抓取成功,信息存储失败',  # scraped, but storing failed
        }
        # Expose every code as an attribute as well: code200 ... code203.
        for status_code in self.dic:
            setattr(self, 'code' + status_code, status_code)
Пример #3
0
class get_json(object):
    """Scrape basic company registration details from xin.baidu.com.

    ``self.db`` is used to read company names and to flag problem entries;
    scraped records are written through ``db1``, which is not defined in
    this class (presumably a module-level handle -- confirm).
    """

    # (record key, XPath) pairs evaluated against the company detail page,
    # listed in the order the keys are written to the record.
    _DETAIL_FIELDS = (
        # Unified social credit code.
        ('credit_code',
         '//td[contains(text(),"统一社会信用代码")]/following-sibling::td[1]/text()'),
        # Date of establishment.
        ('register_time',
         '//*[@class="zx-detail-basic-table"]//td[contains(text(),"成立日期")]/following-sibling::td[1]/text()'),
        # Registered capital.
        ('register_money',
         '//td[contains(text(),"注册资本")]/following-sibling::td[1]/text()'),
        # Industry.
        ('industry',
         '//td[contains(text(),"所属行业")]/following-sibling::td[1]/text()'),
        # Business status (e.g. active / deregistered).
        ('business_state',
         '//td[contains(text(),"经营状态")]/following-sibling::td[1]/text()'),
        # Organization code.
        ('organization_code',
         '//td[contains(text(),"组织机构代码")]/following-sibling::td[1]/text()'),
        # Business registration number.
        ('register_num',
         '//td[contains(text(),"工商注册号")]/following-sibling::td[1]/text()'),
        # Legal representative.
        ('legal_man',
         '//td[contains(text(),"法定代表人")]/following-sibling::td[1]/text()'),
        # Registration authority.
        ('regist_organ',
         '//td[contains(text(),"登记机关")]/following-sibling::td[1]/text()'),
        # Approval / annual inspection date.
        ('confirmtime',
         '//*[@class="zx-detail-basic-table"]//td[contains(text(),"审核/年检日期")]/following-sibling::td[1]/text()'),
        # Business term.
        ('business_timeout',
         '//*[@class="zx-detail-basic-table"]//td[contains(text(),"营业期限")]/following-sibling::td[1]/text()'),
        # Enterprise type.
        # NOTE(review): stored under 'register_address' although the XPath
        # selects the enterprise type; key kept to preserve the stored schema.
        ('register_address',
         '//*[@class="zx-detail-basic-table"]//td[contains(text(),"企业类型")]/following-sibling::td[1]/text()'),
        # Registered address.
        ('registerAddress',
         '//*[@class="zx-detail-basic-table"]//td[contains(text(),"注册地址")]/following-sibling::td[1]/text()'),
        # Business scope (taken from the cell's data-content attribute).
        ('business_scope',
         '//td[contains(text(),"经营范围")]/following-sibling::td[1]//@data-content'),
        # Former name.
        ('usedName',
         '//td[contains(text(),"曾用名")]/following-sibling::td[1]/text()'),
    )

    def __init__(self):
        """Create the MongoDB and Redis helpers and start with an empty item."""
        self.db = MongoDB('172.16.74.249:27017', 'db_reptile_company',
                          'company_name')
        self.redis = REDIS(host=RedisHost,
                           port=RedisPort,
                           password=RedisPassword,
                           db=RedisDB)
        # Holds the record built by the latest get_companydetails() call.
        self.item = {}

    @staticmethod
    def _company_id(company_name):
        """Return the hex MD5 digest of the UTF-8 encoded company name."""
        return hashlib.md5(company_name.encode(encoding='utf-8')).hexdigest()

    # MongoDB -> Redis
    def transfer(self):
        """Copy ``_id`` and ``company_name`` of every MongoDB document to Redis.

        Each document returned by ``self.db.mongo_find({})`` is serialised
        to JSON and passed to ``self.redis.add``; the result of every add is
        printed.
        """
        for doc in self.db.mongo_find({}):
            item = {'_id': doc['_id'], 'company_name': doc['company_name']}
            # NOTE(review): the key 'coampanylidt' looks like a typo, but it
            # is kept because other code may read the same key.
            added = self.redis.add('coampanylidt', json.dumps(item))
            print('存入成功', added, item)

    # Baidu enterprise credit: basic company information
    def get_companydetails(self, company_name):
        """Look up ``company_name`` on xin.baidu.com and store its details.

        The search page is fetched first.  When it contains no result, the
        company is flagged in ``self.db`` as not found.  Otherwise the detail
        page of the first hit is fetched and parsed into a record, which is
        also kept in ``self.item``.  The record is stored through ``db1``
        only when the title of the first hit equals ``company_name``; in any
        other case (including a title that cannot be read) the company is
        flagged in ``self.db`` instead.

        Args:
            company_name: Company name to search for.

        Returns:
            A status message (in Chinese) describing the outcome.
        """
        search_page = s.fetch('https://xin.baidu.com/s?q={}&t=0'.format(
            parse.quote(company_name)))
        pid_list = re.findall(r'{"pid":"(\S+)","entName":', search_page.text)
        if not pid_list:
            self.db.mong_find_one_update(
                {"_id": self._company_id(company_name)},
                {"flag": "未找到匹配的公司名"})
            return '未找到匹配的公司名---%s' % company_name

        # Title of the first search hit, compared with the requested name
        # below.  None when the title cannot be read, which then counts as a
        # mismatch rather than raising.
        titles = search_page.html.xpath(
            '//a[@class="zx-list-item-url"]/@title')
        company = titles[0] if titles else None

        details_href = ('https://xin.baidu.com//detail//compinfo?pid='
                        + pid_list[0])
        print(details_href)
        detail_page = s.fetch(details_href)

        # Build the record in a fresh dict so nothing from a previous call
        # can leak into it.
        item = {}
        for key, xpath in self._DETAIL_FIELDS:
            item[key] = detail_page.html.xpath(xpath, first=True)
        # Source website.
        item['web_source'] = 'https://xin.baidu.com/'
        # Company name as requested by the caller.
        item['company_name'] = company_name
        # URL of the detail page the record was scraped from.
        item['company_url'] = details_href
        item['_id'] = self._company_id(company_name)
        # Current local date.
        item['web_update_time'] = time.strftime("%Y-%m-%d")
        self.item = item

        if company_name != company:
            self.db.mong_find_one_update({"_id": item['_id']},
                                         {"flag": "公司名有问题"})
            return '公司名有问题 --- %s' % item['company_name']

        # NOTE(review): db1 is not defined in this class; it is presumably a
        # module-level MongoDB handle for the details collection -- confirm.
        db1.mongo_add(item)
        return '%s 插入成功 !!!!' % item['company_name']