Пример #1
0
    def __init__(self):
        """
        Prepare the spider: proxies, cipher helper, output files and the pools
        of request identities (extJson blobs and user agents).
        """
        self._can_use_proxy_num = 0
        self.is_debug = False
        if not self.is_debug:
            # Normal run: the Spider base is sized by the number of entries in
            # self.proxies_dict (presumably filled by read_proxy -- confirm).
            self.proxies_dict = []
            self.read_proxy("proxy_032512.txt")
            Spider.__init__(self, len(self.proxies_dict))
        else:
            # Debug run: no proxy file is read, the base is sized to 1.
            Spider.__init__(self, 1)

        self._aes_ = CCIQ_AES()
        # Successful queries.
        self.query_success = FileSaver("c_query_detail.txt")
        # Failed queries.
        self.query_failure = FileSaver("c_query_detail_failure.txt")
        # Companies that have already been crawled.
        self.already_cname_list = FileSaver("c_already_detail.txt")
        # Initialise the set of already-crawled companies.
        self.init_cname()

        self.bloom = set()

        # Pre-encrypted extJson values available to requests.
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ=",
        ]

        # User-Agent strings available to requests.
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
        ]
        self.is_first = True
        self.init_time = 0
Пример #2
0
    def __init__(self):
        """
        Prepare the spider: proxies, cipher helper, output files, the
        encrypted extJson value and the user agent.
        """
        self.proxies_dict = []
        self.read_proxy("proxy_20160218.txt")
        proxy_total = len(self.proxies_dict)
        Spider.__init__(self, proxy_total)

        self.num_count = 0
        self._aes_ = CCIQ_AES()
        # Full information of the company list found by company name.
        self.query_company_info = FileSaver("query_company_info.txt")
        # Partial information of the company list found by company name.
        self.query_company_info_part = FileSaver("query_company_info_part.txt")
        # Company-list lookups (by company name) that failed.
        failure_path = "query_company_info_failure.txt"
        self.query_company_info_failure = FileSaver(failure_path)
        # Company names that have already been crawled.
        self.already_cname = FileSaver("already_cname.txt")
        # Initialise the set of already-crawled companies.
        self.init_cname()
        # Company names whose detail query failed.
        self.detail_failure = FileSaver("detail_failure1.txt")
        # Everything the app exposes about a company, shareholders included.
        self.detail_company = FileSaver("detail_company.txt")

        ext_payload = {
            "cl_screenSize": "640x960",
            "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
            "Org_iOS_Version": "2.0.1"
        }
        serialized_ext = spider.util.utf8str(ext_payload)
        self.extJson = self._aes_.encrypt(serialized_ext)
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
Пример #3
0
    def __init__(self):
        """
        Prepare the spider: proxies, cipher helper, output files and the pools
        of request identities (extJson blobs and user agents).
        """
        self.is_debug = False
        self._can_use_proxy_num = 0
        if not self.is_debug:
            # Normal run: the Spider base is sized by the number of entries in
            # self.proxies_dict (presumably filled by read_proxy -- confirm).
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        else:
            # Debug run: no proxy file is read, the base is sized to 1.
            Spider.__init__(self, 1)
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()

        # Full information of the company list found by company name.
        self.query_company_list = FileSaver("all_company_list.txt")
        # Company names that have already been crawled.
        self.already_cname_list = FileSaver("all_company_list_already.txt")
        # Error types recorded for crawled names.
        self.already_error_type = FileSaver("all_already_error_type.txt")
        # Initialise the set of already-crawled companies.
        self.init_cname()

        # Pre-encrypted extJson values available to requests.
        self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]

        # User-Agent strings available to requests.
        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]

        self.bloom = set()
Пример #4
0
 def __init__(self):
     """Prepare the spider with a fixed size of 1, a cipher helper and output files."""
     # No proxy file is loaded in this variant; the base is initialised with 1.
     Spider.__init__(self, 1)
     self._aes_ = CCIQ_AES()
     self.num_count = 0

     # Companies for which the app returned the full information.
     self.save_success = FileSaver("exist_company.txt")
     # Companies for which the app returned only partial information.
     self.part_success = FileSaver("part_company.txt")
     # Company names whose query failed.
     self.fail_name = FileSaver("fail_name.txt")
Пример #5
0
    def get_detail(self, cname, code, area):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "bl_oc_code": code,  #"71526726X"
            "v1": "QZOrgV004",
            "isDirect": "1",
            "bl_oc_name": cname,  #"腾讯科技"
            "bl_oc_area": area  #"4403"
        }

        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- encryptedJson -->', str(encryptedJson)
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 404:
            print "404 ------ ", code
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            self.get_detail(cname, code, area)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------code ', code, ' res.text is null----------------------------'
                self.detail_failure.append(cname + "|" + str(code) + "|" +
                                           str(area))
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            detail = eval(result)
            listGD = self.get_gd(area, code)
            if listGD is not None:
                detail['listGD'] = listGD['listGD']
            print 'detail=================================', spider.util.utf8str(
                detail)
            self.detail_company.append(spider.util.utf8str(detail))
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
Пример #6
0
    def get_branch(self,cname, now_page=1, list_branch=[], retry=0):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName" : cname,
            "v1" : "QZOrgV005",
            "page" : now_page,
            "pagesize" : "10"
        }

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_branch --- cname=%s --- retry=%d --- reason:len(c)=0" % (cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
                temp = eval(result)
                if temp is not None:
                    for t in temp['Branch']:
                        list_branch.append(t)
                    if len(temp['Branch']) == 10:
                        if now_page > 3:
                            return list_branch
                        now_page += 1
                        print cname, "翻页 -----------------------------------> now_page", now_page
                        return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
                    else:
                        return list_branch
                else:
                    print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page)
                    return None
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- reason:%s" % (cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry*1.5)
                    return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
                else:
                    return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry*1.5)
                return self.get_branch(cname, now_page=now_page, list_branch=list_branch, retry=retry)
            else:
                return None
Пример #7
0
    def get_branch(self, cname, now_page, list_branch):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName": cname,
            "v1": "QZOrgV004",
            "page": now_page,
            "pagesize": "10"
        }
        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_branch ------ res is none ---->', cname, now_page
            return None
        elif res.code == 404:
            print "get_branch ------ 404 --- ", cname, now_page
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_branch ------ ', res.code, cname, now_page
            time.sleep(0.5)
            return self.get_branch(cname, now_page)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_branch------res.text is null----------------------------', cname, now_page
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            temp = eval(result)
            if temp is not None:
                for t in temp['Branch']:
                    list_branch['Branch'].append(t)
                if len(temp['Branch']) == 10:
                    now_page += 1
                    return self.get_branch(cname, now_page, list_branch)
                else:
                    return list_branch
            else:
                print 'get_branch------Branch is null----------------------------', cname, now_page
                return None
        else:
            print cname, "######## get_branch  ################   UNKNOWN ERROR   ######################", res.code
        return None
Пример #8
0
    def __init__(self):
        """
        Prepare the spider with a fixed size of 20, a cipher helper, the
        proxy-filter output file and the pools of request identities.
        """
        Spider.__init__(self, 20)
        self._aes_ = CCIQ_AES()

        self.proxy_filter = FileSaver("proxy_filter_030309_detail1.txt")

        # Pre-encrypted extJson values; each one carries its own surrounding
        # double quotes and escaped slashes.
        self.extJsons = [
            '"Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr\/uapICH92P\/Crryt63u28aP4QP665AzcT\/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="',
            '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a\/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4="',
            '"ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ\/kgBkJt\/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49\/aDwt3NZNp4TGa5iBFpYLm69F\/6PPFoXIR\/Aw5p48\/\/8OgZFpddDUwQ="',
        ]

        # User-Agent strings available to requests.
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)",
        ]
Пример #9
0
    def get_gd(self, area, code):
        """
        获取股东信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"

        encryptedJson = {
            "bl_oc_area": area,  #4107
            "v1": "QZOrgV004",
            "bl_oc_code": code  #672867774
        }

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_gd ------ res is none -- get_gd code is -->', code
            return None
        elif res.code == 404:
            print "get_gd ------ 404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_gd ------ ', res.code, code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_gd ------', code, ' res.text is null----------------------------'
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None
Пример #10
0
    def get_inversted(self, url, encryptedJson):
        """
        通用请求方法
        """

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })

        res = self.request_url(url,
                               headers={"Content-Type": "application/json"},
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- search gd code is -->', code
            return None
        elif res.code == 404:
            print "404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            #print 'gd infos =======================',spider.util.utf8str(list_gd)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None
Пример #11
0
    def get_inversted(self, cname):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {"input": cname, "v1": "QZOrgV004"}
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_inversted ------ res is none --', cname
            return None
        elif res.code == 404:
            print "get_inversted ------ 404 --- ", cname
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_inversted ------ ', res.code, cname
            time.sleep(0.5)
            return self.get_inversted(cname)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print 'get_inversted ------ ', cname, ' res.text is null----------------------------'
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_inversted = eval(result)
            return list_inversted
        else:
            print cname, "##############  get_inversted  ############   UNKNOWN ERROR   #################", res.code
        return None
Пример #12
0
 def get_gd(self, code, retry=0):
     """
     获取股东信息
     """
     url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
     encryptedJson = {
         "bl_oc_area": "",
         "v1": "QZOrgV005",
         "bl_oc_code": code
     }
     res = self.req_all(url, encryptedJson)
     if res is None:
         return None
     if res.code == 200:
         try:
             c = eval(res.text)['c']
             if len(c) == 0:
                 print "get_gd --- retry=%d --- reason:len(c)=0" % retry
                 return None
             result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                 c)
             #print "获取股东信息结果:", spider.util.utf8str(result)
             return eval(result)
         except Exception as err:
             print "get_gd --- retry=%d --- reason:%s" % (retry, err)
             if retry < 5:
                 retry += 1
                 time.sleep(retry * 1.5)
                 return self.get_gd(code, retry=retry)
             else:
                 return None
     else:
         print "get_gd --- retry=%d --- res.code=%d" % (retry, res.code)
         if retry < 5:
             retry += 1
             time.sleep(retry * 1.5)
             return self.get_gd(code, retry=retry)
         else:
             return None
Пример #13
0
    def get_inversted(self, cname, retry=0):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {"input": cname, "v1": "QZOrgV005"}

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_inversted --- cname=%s --- retry=%d --- reason:len(c)=0" % (
                        cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                    c)
                return eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- reason:%s" % (
                    cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry * 1.5)
                    return self.get_inversted(cname, retry=retry)
                else:
                    return None
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry * 1.5)
                return self.get_inversted(cname, retry=retry)
            else:
                return None
Пример #14
0
    def get_branch(self,cname, now_page, list_branch, retry):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName" : cname,
            "v1" : "QZOrgV005",
            "page" : now_page,
            "pagesize" : "10"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_branch(cname,now_page, list_branch, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 500:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            time.sleep(1)
            return self.get_branch(cname, now_page, list_branch, (retry+1))
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
                print "get_branch --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- len(c)=0" % (cname, retry, now_page , res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            temp = eval(result)
            if temp is not None:
                for t in temp['Branch']:
                    list_branch['Branch'].append(t)
                if len(temp['Branch']) == 10:
                    now_page += 1
                    # if now_page >= 10:
                    #     return list_branch
                    return self.get_branch(cname, now_page, list_branch, 0)
                else:
                    return list_branch
            else:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page , res_code)
                return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- UNKNOW ERROR" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(1)
                return self.get_branch(cname, now_page, list_branch, (retry+1))
            else:
                return None
Пример #15
0
    def flip_over(self, now_page, cname, cnt, retry):
        """
        Search the company list by company name and page through the results.

        Each result entry is appended to ``self.query_company_list`` together
        with a ``query_name`` key holding ``cname``.  A page with exactly 20
        entries makes the method call itself for the next page, unless
        ``now_page`` is already greater than 100.

        Failure handling, as visible in the body:
        * no response: the job is re-added; once
          ``get_fail_cnt('failcount-none', 1)`` reaches 10 the method also
          raises ``AccountErrors.NoAccountError``;
        * status 4xx or 202: the job is re-added while
          ``get_fail_cnt('failcount-400', 1)`` is below 5; afterwards the name
          is either recorded as error type ``"400+"`` (``retry`` > 5) or
          re-added with ``retry + 1``, and ``NoAccountError`` is raised;
        * status >= 500: the job is re-added and the thread sleeps 1-9 s;
        * empty payload, undecodable result or empty list: the name is
          recorded in ``self.already_error_type`` and marked as crawled;
        * any other status: re-added up to ``retry`` 3, then recorded as
          ``"unknown_error"``.

        NOTE(review): ``get_fail_cnt(name, n)`` presumably adds ``n`` to a
        per-thread counter and returns it -- confirm against its definition.

        :param now_page: page number to request
        :param cname: company name to search for
        :param cnt: job counter, only logged and passed back to re_add_job
        :param retry: retry count carried by the job
        :raises AccountErrors.NoAccountError: when the failure counters above
            exceed their thresholds
        """
        tid = self.get_tid()
        # The bare string below is a leftover description (in Chinese:
        # "query the company list by company name, paging"); it is not a
        # docstring because it is not the first statement.
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        # Skeleton of the record written to already_error_type on failure.
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson)
        res_code = 0
        if res is None:
            if self.get_fail_cnt('failcount-none', 1) < 10:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                # if retry > 5:
                #     r_result["type"] = "None"
                #     self.already_error_type.append(spider.util.utf8str(r_result))
                #     self.record_spider(cname)
                #     print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
                # else:
                #     self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)})
                # The job is queued again before the error is raised.
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-none', 0), tid))
        else:
            # A response arrived: reset the "no response" counter.
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code

        if (res_code >= 400 and res_code < 500) or res_code == 202:
            if self.get_fail_cnt('failcount-400', 1) < 5:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                if retry > 5:
                    # Give up on this name: record the error type and mark it
                    # as crawled.
                    r_result["type"] = "400+"
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                else:
                    self.re_add_job({
                        'cname': cname,
                        'cnt': cnt,
                        'retry': (retry + 1)
                    })
                    self._can_use_proxy_num -= 1
                # Raised in both cases above.
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-400', 0), tid))
        else:
            # Any other status resets the 4xx counter.
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 2:
            #     r_result["type"]="500"
            #     self.already_error_type.append(spider.util.utf8str(r_result))
            #     self.record_spider(cname)
            # else:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d " % (
                tid, cnt, cname, retry, res_code, now_page)
            # Pause for a random whole number of seconds in [1, 9].
            time.sleep(random.randrange(1, 10, 1))
            return
        elif res_code == 200:
            try:
                # NOTE(review): eval() runs text received from the network.
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text - %s" % (
                    tid, cnt, cname, retry, res_code, now_page, err)
                # r_result["type"] = "res_error"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                # self.record_spider(cname)
                # self.error_cnt += 1
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "c=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                dic = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                    tid, cnt, cname, retry, res_code, now_page, result)
                r_result["type"] = "result_error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            # NOTE(review): this local shadows the builtin ``list`` for the
            # rest of the method.
            list = dic['list']
            if len(list) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "list=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(list))
            for l in list:
                # Copy the entry and tag it with the name that was searched.
                aa = {"query_name": cname}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_list.append(spider.util.utf8str(aa))
            print "******", len(list), spider.util.utf8str(list)
            if len(list) < 20:
                # r_result["type"] = "success"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                # Fewer than a full page: this was the last page.
                self.record_spider(cname)
                return
            elif len(list) == 20:
                if now_page > 100:
                    # Page cap reached; r_result carries no "type" key here.
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                    return
                now_page += 1
                self.flip_over(now_page, cname, cnt, retry)
        else:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % (
                tid, cnt, cname, retry, res_code, now_page)
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            r_result["type"] = "unknown_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return
0
 def get_gd(self, area, code, cname, retry):
     """
     Fetch the shareholder information of a company.

     :param area: area code, sent as ``bl_oc_area``
     :param code: company code, sent as ``bl_oc_code``
     :param cname: company name, used only in log messages
     :param retry: number of retries already spent
     :return: the evaluated, decrypted payload on success; None once 5
         retries are used up after a missing response, a 4xx status or a
         decoding problem.  Statuses >= 500 are retried after one second
         without increasing ``retry``.
     """
     url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
     encryptedJson = {
         "bl_oc_area" : area, #4107
         "v1" : "QZOrgV005",
         "bl_oc_code" : code #672867774
     }
     res = self.req_all(url, encryptedJson)
     # Stays 0 in the log line when no response was received.
     res_code = 0
     if res is None or (res.code >= 400 and res.code < 500):
         if res is not None:
             res_code = res.code
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
         if retry < 5:
             time.sleep(0.1)
             return self.get_gd(area, code, cname, (retry+1))
         else:
             return None
     res_code = res.code
     if res_code >= 500:
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
         time.sleep(1)
         # retry is passed unchanged, so this path has no upper bound.
         return self.get_gd(area, code, cname, retry)
     elif res_code == 200:
         try:
             # NOTE(review): eval() runs text received from the network.
             c = eval(res.text)['c']
         except Exception as err:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d  " % (cname, retry, res_code)
             print "get_gd --- exception res.text:\n", res.text
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         if len(c) == 0:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d  len(c)=0" % (cname, retry, res_code)
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
         try:
             list_gd = eval(result)
         except Exception as err:
             print "get_gd --- cname=%s --- retry=%d --- res.code=%d " % (cname, retry, res_code)
             print 'get_gd --- eval(result) exception , result:\n',result
             if retry < 5:
                 time.sleep(0.1)
                 return self.get_gd(area, code, cname, (retry+1))
             else:
                 return None
         return list_gd
     else:
         print "get_gd --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
         if retry < 5:
             time.sleep(0.1)
             return self.get_gd(area, code, cname, (retry+1))
         else:
             return None
Пример #17
0
    def get_inversted(self, cname, retry):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {
            "input" : cname,
            "v1" : "QZOrgV005"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 400 and res_code < 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None
        elif res_code >= 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            time.sleep(1)
            return self.get_inversted(cname, retry)
        elif res.code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print "get_inversted --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                list_inversted = eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print 'get_inversted --- eval(result) exception , result:\n', result
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            return list_inversted
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_inversted(cname, (retry+1))
            else:
                return None
Пример #18
0
class QycxbDetail(Spider):
    """
    Spider that requests the combined detail record of each company listed
    in the input file and stores the result.

    Proxy lines are read from a text file; the 4-field form accepted by
    ``_match_proxy`` is ``host:port:user:password``.
    """
    def __init__(self):
        """
        Load the proxy list, initialise the Spider base class and open the
        output files.

        The proxy file must be read before ``Spider.__init__`` is called,
        because the number of loaded proxies is passed to it (presumably
        the worker-thread count -- confirm against the Spider base class).
        """
        self._can_use_proxy_num = 0
        self.is_debug = False
        if self.is_debug:
            # NOTE: self.proxies_dict is not created on this path.
            Spider.__init__(self, 1)
        else:
            self.proxies_dict = []
            self.read_proxy("proxy_032512.txt")
            Spider.__init__(self, len(self.proxies_dict))

        self._aes_ = CCIQ_AES()
        # detail records fetched successfully
        self.query_success = FileSaver("c_query_detail.txt")
        # input lines whose fetch failed
        self.query_failure = FileSaver("c_query_detail_failure.txt")
        # every input line already processed
        self.already_cname_list = FileSaver("c_already_detail.txt")
        # preload the lines processed by earlier runs
        self.init_cname()

        #self.extJson = self._aes_.encrypt(spider.util.utf8str({"cl_screenSize": "640x960", "cl_cookieId": "B200BA9D-A3A0-4140-A293-9A1A671BA5CE", "Org_iOS_Version": "2.0.1"}))
        # self.extJson = "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4="
        # self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
        # company names already handled during this run
        self.bloom = set()

        # Pre-encrypted "extJson" request values; req_all picks one at random
        # and uses the same index for the user agent below.
        self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]

        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]
        # req_all records the time of the first request in init_time
        self.is_first = True
        self.init_time = 0

    def req_all(self, url, encryptedJson):
        #time.sleep(random.randrange(5, 11, 1))
        #time.sleep(2)
        number = random.randrange(0, 3, 1)
        self.select_user_agent(self.user_agents[number])
        param = spider.util.utf8str({"encryptedJson":self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson":self.extJsons[number]})
        param = param.replace('/', "\/")
        if self.is_first:
            self.init_time = time.time()
            print '初始化时间',self.init_time
            self.is_first = False
        if self.is_debug:
            res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies={'http': 'http://*****:*****@121.41.79.4:18889', 'https': 'https://*****:*****@121.41.79.4:18889'})
            #res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies={'http': 'http://137.135.166.225:8120', 'https': 'https://137.135.166.225:8120'})
        else:
            res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies=self.proxies_dict[self.get_tid()])
        return res

    def init_cname(self):
        """
        Load the lines processed by earlier runs into ``filter_name``.

        Reads ``c_already_detail.txt`` and adds every line, stripped of
        surrounding whitespace, to the module-level set.
        """
        with open("c_already_detail.txt","r") as done_file:
            filter_name.update(row.strip() for row in done_file)

    def wait_q_breakable(self):
        """
        Block until the job queues look drained or no worker is left.

        Returns True when a pass (other than the first) finds all three job
        queues empty and ``_running_count`` at 0; returns False when
        ``_worker_count`` is 0.
        """
        # time of the previous pass; 0 keeps the first pass from returning True
        lt = 0
        while True:
            # work is still queued: wait longer, which also makes the
            # "less than 1s since lt" test below fail on this pass
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            # only true when the 5s sleep above was skipped (queues empty)
            # and nothing is currently running
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            # no workers left
            if self._worker_count == 0:
                return False

    def dispatch(self):
        #with open("a_queried_company_list.txt","r") as f:
        with open("un_spider_queries.txt", "r") as f:
            cnt = 0
            for line in f:
                line = line.strip()
                cnt += 1
                if line in filter_name:
                    print cnt, "already spider!!!"
                    continue
                job = {"line":line, "cnt":cnt, "retry":0}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)


    def record_spider(self, line, cname):
        """
        Mark an input line as processed, whether it succeeded or failed.

        The company name goes into ``self.bloom``; the raw line goes into the
        module-level ``filter_name`` set and is appended to the
        already-processed file.
        """
        self.bloom.add(cname)
        filter_name.add(line)
        self.already_cname_list.append(line)

    def run_job(self, jobid):
        """
        Unpack a job dict and hand it to ``get_detail``.

        Missing keys are passed on as None.
        """
        line, cnt, retry = [jobid.get(key) for key in ("line", "cnt", "retry")]
        self.get_detail(line, cnt, retry)

    def get_detail(self, line, cnt, retry):
        tid = self.get_tid()
        try:
            param = eval(line)
        except Exception as err:
            print 'tid=%d --- cnt=%d --- data is not json, return'%(tid, cnt)
            self.record_spider(line,'UNKNOW')
            return
        cname = param['oc_name']
        if cname in self.bloom:
            cname = param['query_name']
            if cname in self.bloom:
                print 'query_name:%s aleready crawler...'%cname
                return
        ccode = param['oc_code']
        carea = param['oc_area']
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code" : ccode,#code,  #"71526726X"
            "v1" : "QZOrgV005",
            "isDirect" : "0",
            "bl_oc_name" : cname,#cname,  #"腾讯科技"
            "bl_oc_area" : carea #area #"4403"
        }
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None :
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
                return
            else:
                # if retry > 5:
                #     self.query_failure.append(line)
                #     self.record_spider(line, cname)
                #     return
                # else:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-none = [ %d ]" % self.get_fail_cnt(0, 'failcount-none'))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code
        if (res_code >= 400 and res_code < 500) or res_code == 202 :
            #print time.time(),"出现################",(time.time()-self.init_time), " res.code=", res_code
            # if retry > 20:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            if self.get_fail_cnt(1, 'failcount-400') > 30:
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-400 = [ %d ]" % self.get_fail_cnt(0, 'failcount-400'))
            return
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 5:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            time.sleep(2)
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  exception res.text " % (tid, cnt, cname, retry, res_code)
                #print "exception res.text:\n", res.text
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (tid, cnt, cname, retry, res_code)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                detail = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception result:%s" % (tid, cnt, cname, retry, res_code, result)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return

            #print 'tid=', tid, 'proxy=', self.proxies_dict[tid], ' detail=',spider.util.utf8str(detail)
            #print 'tid=', tid, ' detail=',spider.util.utf8str(detail)

            #股东信息
            listGD = self.get_gd(carea, ccode, cname, 0)
            if listGD is not None:
                #print "tid=",tid," listGD=",spider.util.utf8str(listGD)
                detail['listGD'] = listGD['listGD']

            #投资信息
            list_inversted = self.get_inversted(cname, 0)
            if list_inversted is not None:
                #print "tid=",tid," list_inversted=",spider.util.utf8str(list_inversted)
                detail['inversted'] = list_inversted['inversted']

            #获取分支机构信息
            list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
            if list_branch is not None:
                #print "tid=",tid," list_branch=",spider.util.utf8str(list_branch)
                detail['Branch'] = list_branch['Branch']

            self.query_success.append(spider.util.utf8str(detail))
            self.record_spider(line, cname)

            print "tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- success:\n %s" % (tid,self.proxies_dict[tid], cnt, cname, retry, res_code, spider.util.utf8str(detail))
        else:
            self.query_failure.append(line)
            self.record_spider(line, cname)
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception UNKNOW ERROR" % (tid, cnt, cname, retry, res_code)
            return



    def get_gd(self, area, code, cname, retry):
        """
        获取股东信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
        encryptedJson = {
            "bl_oc_area" : area, #4107
            "v1" : "QZOrgV005",
            "bl_oc_code" : code #672867774
        }
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_gd(area, code, cname, (retry+1))
            else:
                return None
        res_code = res.code
        if res_code >= 500:
            print "get_gd --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            time.sleep(1)
            return self.get_gd(area, code, cname, retry)
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_gd --- cname=%s --- retry=%d --- res.code=%d  " % (cname, retry, res_code)
                print "get_gd --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_gd(area, code, cname, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_gd --- cname=%s --- retry=%d --- res.code=%d  len(c)=0" % (cname, retry, res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_gd(area, code, cname, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                list_gd = eval(result)
            except Exception as err:
                print "get_gd --- cname=%s --- retry=%d --- res.code=%d " % (cname, retry, res_code)
                print 'get_gd --- eval(result) exception , result:\n',result
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_gd(area, code, cname, (retry+1))
                else:
                    return None
            return list_gd
        else:
            print "get_gd --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_gd(area, code, cname, (retry+1))
            else:
                return None


    def get_inversted(self, cname, retry):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {
            "input" : cname,
            "v1" : "QZOrgV005"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 400 and res_code < 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            if retry < 5:
                return self.get_inversted(cname, (retry+1))
            else:
                return None
        elif res_code >= 500:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
            time.sleep(1)
            return self.get_inversted(cname, retry)
        elif res.code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print "get_inversted --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                list_inversted = eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (cname, retry, res_code)
                print 'get_inversted --- eval(result) exception , result:\n', result
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_inversted(cname, (retry+1))
                else:
                    return None
            return list_inversted
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d ---UNKNOW ERROR" % (cname, retry, res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_inversted(cname, (retry+1))
            else:
                return None


    def get_branch(self,cname, now_page, list_branch, retry):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName" : cname,
            "v1" : "QZOrgV005",
            "page" : now_page,
            "pagesize" : "10"
        }

        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None or (res.code >= 400 and res.code < 500):
            if res is not None:
                res_code = res.code
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(0.1)
                return self.get_branch(cname,now_page, list_branch, (retry+1))
            else:
                return None

        res_code = res.code
        if res_code >= 500:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
            time.sleep(1)
            return self.get_branch(cname, now_page, list_branch, (retry+1))
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d" % (cname, retry, now_page , res_code)
                print "get_branch --- exception res.text:\n", res.text
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            if len(c) == 0:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- len(c)=0" % (cname, retry, now_page , res_code)
                if retry < 5:
                    time.sleep(0.1)
                    return self.get_branch(cname, now_page, list_branch, (retry+1))
                else:
                    return None
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            temp = eval(result)
            if temp is not None:
                for t in temp['Branch']:
                    list_branch['Branch'].append(t)
                if len(temp['Branch']) == 10:
                    now_page += 1
                    # if now_page >= 10:
                    #     return list_branch
                    return self.get_branch(cname, now_page, list_branch, 0)
                else:
                    return list_branch
            else:
                print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (cname, retry, now_page , res_code)
                return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- UNKNOW ERROR" % (cname, retry, now_page , res_code)
            if retry < 5:
                time.sleep(1)
                return self.get_branch(cname, now_page, list_branch, (retry+1))
            else:
                return None



    def get_fail_cnt(self, addv , type):
        """
        Read, and optionally increase, a failure counter kept on
        ``self._curltls``.

        :param addv: amount to add; a falsy value makes this a pure read
        :param type: attribute name of the counter (missing counts as 0)
        :return: the counter value after the optional increment
        """
        current = getattr(self._curltls, type, 0)
        if not addv:
            return current
        current += addv
        setattr(self._curltls, type, current)
        return current

    def event_handler(self, evt, msg, **kwargs):
        """
        Send a notification mail when the 'DONE' event arrives.

        Any other event is ignored.  The mail body is ``msg`` followed by a
        fixed notice; the subject carries the script name.
        """
        if evt != 'DONE':
            return
        body = msg + '企业查询宝APP公司详情detail查询已经停止...'
        subject = '%s DONE' % sys.argv[0]
        spider.util.sendmail('*****@*****.**', subject, body)

    def read_proxy(self,fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
                # m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                # m1 = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+:\w+:\w+)', line, re.I)
                # if m:
                #     prstr = m.group(1)
                #     proxies = {'http': 'http://' + prstr+"/", 'https': 'https://' + prstr+"/"}
                #     self.proxies_dict.append(proxies)
                # elif re.match('\s*#', line):
                #     continue
        print " loaded [ %d ] proxis " % len(self.proxies_dict)


    def _match_proxy(self,line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
Пример #19
0
class QycxbQuery(Spider):
    """
    根据企业名称.查询企业列表------针对900多万的公司名查询
    """
    def __init__(self):
        """Set up output files, request fingerprints and counters.

        In debug mode ``Spider.__init__`` is called with 80 and no proxy file
        is read; otherwise the proxy list is loaded and its length is passed
        instead (presumably the worker/thread count -- confirm against Spider).
        """
        self.is_debug = True
        self._can_use_proxy_num = 0
        if self.is_debug:
            Spider.__init__(self, 80)
        else:
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()
        # Full company-list records returned for each queried name
        self.query_company_list = FileSaver("all_company_list.txt")

        # Company names that have already been crawled
        self.already_cname_list = FileSaver("all_company_list_already.txt")

        # Crawled names that ended in an error, with the error type
        self.already_error_type = FileSaver("all_already_error_type.txt")

        # Requests whose results continue past the pages flip_over walks
        self.need_flip_page_data = FileSaver("beijing_need_flip_page_data.txt")

        # Load the names crawled by earlier runs
        self.init_cname()
        # Pre-encrypted extJson payloads; req_all picks one at random together
        # with the user agent at the same index.
        self.extJsons = ["Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
                         "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="]

        self.user_agents = ["=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
                            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"]

        self.bloom = set()
        self.proxy_error_cnt = 0
        self.lock = threading.Lock()

    def req_all(self, encryptedJson, retry=0, cname=None):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        number = random.randrange(0, 3, 1)
        self.select_user_agent(self.user_agents[number])
        param = spider.util.utf8str({"encryptedJson": self._aes_.encrypt(spider.util.utf8str(encryptedJson)), "extJson": self.extJsons[number]})
        param = param.replace('/', "\/")
        res = None
        if self.is_debug:
            res = self.request_url(url, headers={"Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1"}, timeout=20, data=param, proxies={'http': 'http://*****:*****@192.168.1.39:3428', 'https': 'https://*****:*****@192.168.1.39:3428'})
        else:
            res = self.request_url(url, headers={"Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1"}, data=param, proxies=self.proxies_dict[self.get_tid()])
        if res is None or res.code != 200:
            print "访问错误", cname, "res is none" if res is None else "res.code=%d" % (res.code)
            self.error_add()
            if retry < 10:
                time.sleep(random.randrange(1, 5, 1))
                return self.req_all(encryptedJson, retry=(retry+1))
        return res


    def init_cname(self):
        i = 0
        with open("all_company_list_already.txt", "r") as f:
            for line in f:
                i += 1
                filter_name.add(line.strip())
        print "init already query company name finish...", i

    def error_add(self):
        """Hook invoked after a failed request; currently does nothing.

        The earlier behaviour is switched off: it incremented
        ``self.proxy_error_cnt`` under ``self.lock`` and called
        ``self.restart_jb()`` once the count went above 200.
        """
        return None

    def restart_jb(self):
        if self.proxy_error_cnt < 200:
            return
        self.proxy_error_cnt = 0
        print "=============================重新启动拨号脚本================================="
        os.system("sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/redial")
        time.sleep(10)
        os.system("sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/getip")
        print "=============================重新启动拨号脚本成功=============================="


    def wait_q_breakable(self):
        """Block until the job queues look drained or all workers are gone.

        :return: True when a pass finds the queues empty (no 5 s wait was
            taken) and ``self._running_count`` is 0; False when
            ``self._worker_count`` is 0.
        """
        # Timestamp taken at the end of the previous pass; 0 makes the
        # "recently stamped" test below fail on the first pass.
        lt = 0
        while True:
            # Jobs still queued: wait before looking again.
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            # Less than a second since `lt` means the 5 s sleep above was not
            # taken on this pass, i.e. all three queues were empty.
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue one job per not-yet-crawled name in "beijing_cname.txt".

        ``cnt`` in each job is the 1-based line number of the name. After the
        file is exhausted the method waits via ``wait_q_breakable`` and then
        queues a final ``None`` job.
        """
        with open("beijing_cname.txt", "r") as names:
            for cnt, raw in enumerate(names, 1):
                cname = raw.strip()
                if cname in filter_name:
                    # Already handled by an earlier run.
                    continue
                self.add_job({"cname": cname, "cnt": cnt, "retry": 0}, True)
        self.wait_q_breakable()
        self.add_job(None, True)


    def record_spider(self,line):
        """Mark ``line`` as crawled, whether the crawl succeeded or failed.

        The name is added to the in-memory ``filter_name`` collection and
        appended to the "already crawled" file; the proxy error counter is
        reset to 0 afterwards.
        """
        crawled_name = line
        filter_name.add(crawled_name)
        self.already_cname_list.append(crawled_name)
        self.proxy_error_cnt = 0

    def run_job(self, job):
        cname = job.get("cname")
        cnt = job.get("cnt")
        retry = job.get("retry")
        if cname is None:
            print 'cname = ', cnt, ' is None ,break~'
            return
        self.flip_over(1, cname, cnt, retry)


    def flip_over(self , now_page , cname , cnt , retry):
        tid = self.get_tid()
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
  "pagesize" : "20",
  "page" : now_page,
  "od_orderBy" : "0",
  "sh_searchType" : "一般搜索",
  "sh_oc_areaName" : "",
  "od_statusFilter" : "0",
  "v1" : "QZOrgV005",
  "oc_name" : cname,
  "sh_u_uid" : "",
  "sh_u_name" : ""
}
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson, cname=cname)
        res_code = 0
        if res is None:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            return

        if u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志" in res.text:
            print "处理请求时服务器遇到错误。有关详细信息,请参见服务器日志..."
            if retry < 3:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': (retry+1)})
            else:
                r_result["type"] = "request-server-error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
            return

        try:
            if u"服务不可用。" in res.text or u"Unauthorized!" in res.text:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "系统不可用...", cname, res.text
                return
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception:%s  --- res.text: %s " % (tid, cnt, cname, retry, res_code, now_page, err, spider.util.utf8str(res.text))
            if retry < 3:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': (retry+1)})
            else:
                r_result["type"] = "res.text=invalid"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
            return
        if len(c) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "c=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            dic = eval(result)
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (tid, cnt, cname, retry, res_code, now_page, result)
            r_result["type"] = "result_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        list = dic['list']
        if len(list) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "list=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        for l in list:
            aa = {"query_name": cname}
            for k, v in l.items():
                aa[k] = v
            self.query_company_list.append(spider.util.utf8str(aa))
        print cnt, "******", len(list), spider.util.utf8str(list)
        if len(list) < 20:
            self.record_spider(cname)
            return
        elif len(list) == 20:
            if now_page > 3:
                #self.already_error_type.append(spider.util.utf8str(r_result))
                self.need_flip_page_data.append(spider.util.utf8str(encryptedJson))
                self.record_spider(cname)
                return
            now_page += 1
            self.flip_over(now_page, cname, cnt, retry)


    def get_fail_cnt(self, type_key, addv):
        """Return the failure counter ``type_key``, adding ``addv`` first if set.

        :param type_key: attribute name of the counter on ``self._curltls``.
        :param addv: increment to apply; a falsy value leaves the counter
            untouched.
        :return: the counter value after any increment (0 when never set).
        """
        holder = self._curltls
        total = getattr(holder, type_key, 0)
        if addv:
            total = total + addv
            setattr(holder, type_key, total)
        return total

    def event_handler(self, evt, msg, **kwargs):
        """Mail a completion notice, including the error count, on 'DONE'.

        Events other than 'DONE' are ignored.
        """
        if evt != 'DONE':
            return
        msg += '企业查询宝APP公司列表查询已经停止...错误数:'+str(self.error_cnt)
        subject = '%s DONE' % sys.argv[0]
        spider.util.sendmail('*****@*****.**', subject, msg)

    def read_proxy(self,fn):
        """Load proxy definitions from ``fn`` and record how many were read.

        Every non-empty, non-comment line is stripped and passed to
        ``self._match_proxy``. Blank lines and lines starting with '#' are
        skipped; previously they became unusable proxy entries such as
        ``http://`` and inflated the proxy count.

        :param fn: path of the proxy list, one proxy per line.
        """
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                self._match_proxy(line)
        self._can_use_proxy_num = len(self.proxies_dict)
        # Parenthesised single argument: prints the same text under the
        # Python 2 print statement and also parses under Python 3.
        print(" loaded [ %d ] proxis " % self._can_use_proxy_num)

    def _match_proxy(self,line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1), m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
Пример #20
0
    def flip_over(self, now_page, cname):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        extJson = {
            "cl_screenSize": "640x960",
            "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
            "Org_iOS_Version": "2.0.1"
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self._aes_.encrypt(spider.util.utf8str(extJson))
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url, headers=headers, data=param)
        if res is None:
            print 'res is none -- search company name is -->', cname
            self.fail_name.append(cname)
            return
        elif res.code == 404:
            print "%s ------ 404" % cname
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (cname, res.code)
            self.add_job({'cname': cname})
            time.sleep(0.5)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                return
            print 'cname %s result ################### now get list length is %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.save_success.append(spider.util.utf8str(aa))
                x = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.part_success.append(x)

            print "-------------------------------------------cname %s page %d finish-----------------------------------" % (
                cname, now_page)
            rowcount = dic['rowcount']
            print "==============cname %s=======page %d=========rowcount %d===========" % (
                cname, now_page, rowcount)
            # page_count = rowcount/20 if rowcount%20==0 else (rowcount/20+1)
            # if now_page < page_count:
            #     now_page += 1
            #     self.flip_over(now_page,cname)
            # time.sleep(0.1)
            now_page += 1
            time.sleep(0.1)
            self.flip_over(now_page, cname)
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
Пример #21
0
    def get_detail(self, line, cnt, retry):
        """Fetch the detail record for one company-list line and save it.

        ``line`` is the text of a dict with at least the keys 'oc_name',
        'query_name', 'oc_code' and 'oc_area'. On HTTP 200 the decrypted
        detail is extended with shareholder, investment and branch data and
        appended to ``query_success``. Other outcomes either re-queue the job
        via ``re_add_job`` or record the line in ``query_failure``.

        :param line: raw input line describing the company.
        :param cnt: sequence number of the line (used for logging).
        :param retry: retries already spent on this job.
        :raises AccountErrors.NoAccountError: when the 'failcount-none'
            counter has reached 10, or the 'failcount-400' counter exceeds 30
            (the messages suggest the proxy is considered invalid).
        """
        tid = self.get_tid()
        try:
            # NOTE(review): eval() on the input line executes arbitrary
            # expressions; the log message below suggests JSON is expected.
            param = eval(line)
        except Exception as err:
            print 'tid=%d --- cnt=%d --- data is not json, return'%(tid, cnt)
            self.record_spider(line,'UNKNOW')
            return
        cname = param['oc_name']
        # If the company name is already known, fall back to the query name;
        # skip the job only when both are known.
        if cname in self.bloom:
            cname = param['query_name']
            if cname in self.bloom:
                print 'query_name:%s aleready crawler...'%cname
                return
        ccode = param['oc_code']
        carea = param['oc_area']
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code" : ccode,  # company code, e.g. "71526726X"
            "v1" : "QZOrgV005",
            "isDirect" : "0",
            "bl_oc_name" : cname,  # company name
            "bl_oc_area" : carea  # area code, e.g. "4403"
        }
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None :
            # No response at all: re-queue, and raise once the counter
            # returned by get_fail_cnt reaches 10.
            if self.get_fail_cnt(1, 'failcount-none') < 10:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
                return
            else:
                # if retry > 5:
                #     self.query_failure.append(line)
                #     self.record_spider(line, cname)
                #     return
                # else:
                self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-none = [ %d ]" % self.get_fail_cnt(0, 'failcount-none'))
        else:
            # A response arrived: reset the "no response" counter.
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code
        if (res_code >= 400 and res_code < 500) or res_code == 202 :
            # 4xx or 202: re-queue; raise once the counter goes above 30.
            #print time.time(),"occurred ################",(time.time()-self.init_time), " res.code=", res_code
            # if retry > 20:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            if self.get_fail_cnt(1, 'failcount-400') > 30:
                self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError("Maybe the proxy invalid,failcount-400 = [ %d ]" % self.get_fail_cnt(0, 'failcount-400'))
            return
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # Server-side error: re-queue and pause briefly.
            # if retry > 5:
            #     self.query_failure.append(line)
            #     self.record_spider(line, cname)
            # else:
            self.re_add_job({'line':line,'cnt':cnt, 'retry':(retry+1)})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d " % (tid, cnt, cname, retry, res_code)
            time.sleep(2)
            return
        elif res_code == 200:
            try:
                # NOTE(review): eval() on a remote response body.
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  exception res.text " % (tid, cnt, cname, retry, res_code)
                #print "exception res.text:\n", res.text
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (tid, cnt, cname, retry, res_code)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                detail = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception result:%s" % (tid, cnt, cname, retry, res_code, result)
                self.query_failure.append(line)
                self.record_spider(line, cname)
                return

            #print 'tid=', tid, 'proxy=', self.proxies_dict[tid], ' detail=',spider.util.utf8str(detail)
            #print 'tid=', tid, ' detail=',spider.util.utf8str(detail)

            # Shareholder information
            listGD = self.get_gd(carea, ccode, cname, 0)
            if listGD is not None:
                #print "tid=",tid," listGD=",spider.util.utf8str(listGD)
                detail['listGD'] = listGD['listGD']

            # Investment information
            list_inversted = self.get_inversted(cname, 0)
            if list_inversted is not None:
                #print "tid=",tid," list_inversted=",spider.util.utf8str(list_inversted)
                detail['inversted'] = list_inversted['inversted']

            # Branch-office information
            list_branch = self.get_branch(cname, 1, {"Branch": []}, 0)
            if list_branch is not None:
                #print "tid=",tid," list_branch=",spider.util.utf8str(list_branch)
                detail['Branch'] = list_branch['Branch']

            self.query_success.append(spider.util.utf8str(detail))
            self.record_spider(line, cname)

            # NOTE(review): this reads self.proxies_dict[tid]; confirm the
            # attribute exists in every mode this method runs in.
            print "tid=%d --- proxy=%s --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- success:\n %s" % (tid,self.proxies_dict[tid], cnt, cname, retry, res_code, spider.util.utf8str(detail))
        else:
            # Any other status is recorded as a failure without retrying.
            self.query_failure.append(line)
            self.record_spider(line, cname)
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- exception UNKNOW ERROR" % (tid, cnt, cname, retry, res_code)
            return
Пример #22
0
class QycxbApp(Spider):
    """
    根据企业名称 访问接口 获得公司是否存在,存在则拿出其注册号和公司名称保存,不存在则忽略
    """
    def __init__(self):
        """Initialise the spider and the three output files.

        Proxy loading is disabled here; ``Spider.__init__`` is called with 1
        (presumably the thread count -- confirm against Spider).
        """
        #self.proxies_dict = []
        #self.read_proxy("../spider/proxy/proxy.txt")
        #Spider.__init__(self, len(self.proxies_dict))
        Spider.__init__(self, 1)
        self.num_count = 0
        self._aes_ = CCIQ_AES()
        # Full records of companies the app returned
        self.save_success = FileSaver("exist_company.txt")
        # Condensed records (name, area, code, number) of the same companies
        self.part_success = FileSaver("part_company.txt")
        # Company names whose query failed
        self.fail_name = FileSaver("fail_name.txt")

    def wait_q_breakable(self):
        """Block until the job queues look drained or all workers are gone.

        :return: True when a pass finds the queues empty (no 5 s wait was
            taken) and ``self._running_count`` is 0; False when
            ``self._worker_count`` is 0.
        """
        # Timestamp taken at the end of the previous pass; 0 makes the
        # "recently stamped" test below fail on the first pass.
        lt = 0
        while True:
            # Jobs still queued: wait before looking again.
            if not self.job_queue.empty() or not self.job_queue2.empty(
            ) or not self.job_queue3.empty():
                time.sleep(5)
            # Less than a second since `lt` means the 5 s sleep above was not
            # taken on this pass, i.e. all three queues were empty.
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue one job per well-formed line of "old-company.txt".

        A line counts as well-formed when splitting it on single spaces gives
        exactly three fields; the third field is the company name. Once the
        file is exhausted the method waits via ``wait_q_breakable`` and
        queues a final ``None`` job.

        Fix: the original looped with ``while True`` over ``readline()`` and
        never left the loop at end of file, so the wait and the final
        ``None`` job were never reached.
        """
        with open("old-company.txt", "r") as f:
            for line in f:
                ary = line.strip().split(" ")
                if len(ary) == 3:
                    job = {"cname": ary[2]}
                    self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        """Return the 'failcount' counter on ``self._curltls``, adding ``addv`` first.

        :param addv: increment to apply; a falsy value makes this a pure read.
        :return: the counter value after any increment (0 when never set).
        """
        holder = self._curltls
        failures = getattr(holder, 'failcount', 0)
        if not addv:
            return failures
        failures += addv
        setattr(holder, 'failcount', failures)
        return failures

    def run_job(self, jobid):
        """Run one job: set the app's user agent and search page 1 for its name.

        :param jobid: job dict; its "cname" entry is the company name.
        """
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")
        self.flip_over(1, jobid.get("cname"))

    def flip_over(self, now_page, cname):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        extJson = {
            "cl_screenSize": "640x960",
            "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
            "Org_iOS_Version": "2.0.1"
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self._aes_.encrypt(spider.util.utf8str(extJson))
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url, headers=headers, data=param)
        if res is None:
            print 'res is none -- search company name is -->', cname
            self.fail_name.append(cname)
            return
        elif res.code == 404:
            print "%s ------ 404" % cname
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%s ------ %d " % (cname, res.code)
            self.add_job({'cname': cname})
            time.sleep(0.5)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                return
            print 'cname %s result ################### now get list length is %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.save_success.append(spider.util.utf8str(aa))
                x = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.part_success.append(x)

            print "-------------------------------------------cname %s page %d finish-----------------------------------" % (
                cname, now_page)
            rowcount = dic['rowcount']
            print "==============cname %s=======page %d=========rowcount %d===========" % (
                cname, now_page, rowcount)
            # page_count = rowcount/20 if rowcount%20==0 else (rowcount/20+1)
            # if now_page < page_count:
            #     now_page += 1
            #     self.flip_over(now_page,cname)
            # time.sleep(0.1)
            now_page += 1
            time.sleep(0.1)
            self.flip_over(now_page, cname)
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)

    def event_handler(self, evt, msg, **kwargs):
        """Mail a "crawl has stopped" notice when the 'DONE' event arrives.

        Events other than 'DONE' are ignored.
        """
        if evt != 'DONE':
            return
        msg += '企业查询宝APP爬取已经停止...'
        subject = '%s DONE' % sys.argv[0]
        spider.util.sendmail('*****@*****.**', subject, msg)

    def read_proxy(self, fn):
        """Load ``ip:port`` proxies from ``fn`` into ``self.proxies_dict``.

        Only lines that start (after stripping) with a dotted-quad address and
        a port are used; everything else, including '#' comment lines, is
        ignored. The number of loaded proxies is printed at the end.
        """
        address = re.compile(r'(\d+\.\d+\.\d+\.\d+:\d+)', re.I)
        with open(fn, 'r') as source:
            for raw in source:
                hit = address.match(raw.strip())
                if not hit:
                    continue
                prstr = hit.group(1)
                self.proxies_dict.append({
                    'http': 'http://' + prstr + "/",
                    'https': 'https://' + prstr + "/"
                })
        print(" loaded [ %d ] proxis " % len(self.proxies_dict))
Пример #23
0
    def flip_over(self, now_page, cname, line, cnt, retry):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])
        if res is None:
            if self.get_fail_cnt(1) < 10:
                print "%d-----%s ------ res is None" % (cnt, cname)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                #self.query_company_info_failure.append(line)
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 404 or res.code == 403:
            if self.get_fail_cnt(1) < 20:
                print "%d-----%s ------ %d" % (cnt, cname, res.code)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d------%s ------ %d " % (cnt, cname, res.code)
            self.add_job({'line': line, 'cnt': cnt})
            time.sleep(1)
            return False
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                self.query_company_info_failure.append(line)
                return True
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                self.query_company_info_failure.append(line)
                return True
            print 'cname %s result ###################  list length ------ %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_info.append(spider.util.utf8str(aa))
                part = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.query_company_info_part.append(part)
                self.get_detail(l['oc_name'], l['oc_code'], l['oc_area'])
            if len(list) < 20:
                return True
            elif len(list) == 20:
                now_page += 1
                self.flip_over(now_page, cname, line, cnt)
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
            self.query_company_info_failure.append(line)
            return True
Пример #24
0
class QycxbApp(Spider):
    """
    根据企业名称 查询公司列表(可能多个,可能不存在) , 存在则根据每条数据内容查询详情detail,查询完详情再请求股东信息,跟详情拼接在一起存储到文件detail_company.txt
    """
    def __init__(self):
        """Load the proxy list, start the base spider and open the output files."""
        # The proxy list is read first because its length is the value
        # handed to Spider.__init__ (presumably the worker count - confirm).
        self.proxies_dict = []
        self.read_proxy("proxy_20160218.txt")
        proxy_total = len(self.proxies_dict)
        Spider.__init__(self, proxy_total)

        self.num_count = 0
        self._aes_ = CCIQ_AES()

        # Full company-list records returned by a name search.
        self.query_company_info = FileSaver("query_company_info.txt")
        # Condensed company-list records returned by a name search.
        self.query_company_info_part = FileSaver("query_company_info_part.txt")
        # Input lines whose name search failed.
        failure_path = "query_company_info_failure.txt"
        self.query_company_info_failure = FileSaver(failure_path)
        # Input lines that have already been crawled.
        self.already_cname = FileSaver("already_cname.txt")
        # Load the names crawled by earlier runs.
        self.init_cname()
        # Companies whose detail query failed.
        self.detail_failure = FileSaver("detail_failure1.txt")
        # Everything the app exposes about a company, shareholders included.
        self.detail_company = FileSaver("detail_company.txt")

        device_info = {
            "cl_screenSize": "640x960",
            "cl_cookieId": "16923697-D73E-485A-BDCF-68FAD456AC02",
            "Org_iOS_Version": "2.0.1"
        }
        self.extJson = self._aes_.encrypt(spider.util.utf8str(device_info))
        self.select_user_agent("=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)")

    def init_cname(self):
        """Add every stripped line of already_cname.txt to ``filter_name``."""
        with open("already_cname.txt", "r") as done_file:
            for raw in done_file:
                entry = raw.strip()
                filter_name.add(entry)

    def wait_q_breakable(self):
        """Poll the three job queues until the work is over.

        Returns True when, less than a second after the previous pass
        finished, ``_running_count`` is zero; returns False when
        ``_worker_count`` is zero at the end of a pass.
        """
        last_pass = 0
        while True:
            all_empty = (self.job_queue.empty() and self.job_queue2.empty()
                         and self.job_queue3.empty())
            if not all_empty:
                # Work is still queued: back off before checking again.
                time.sleep(5)
            checked_recently = time.time() < last_pass + 1
            if checked_recently and self._running_count == 0:
                return True
            time.sleep(2)
            last_pass = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        with open("corp_name.txt", "r") as f:
            cnt = 0
            while True:
                line = f.readline().strip()
                cnt += 1
                if line is None:
                    break
                if line in filter_name:
                    print line, " already spider~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
                    continue
                job = {"line": line, "cnt": cnt, "retry": 0}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        fc = getattr(self._curltls, 'failcount', 0)
        if (addv):
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        line = jobid.get("line")
        cnt = jobid.get("cnt")
        retry = jobid.get("retry")
        if line is None:
            print 'line = ', cnt, ' is None ,break~'
            return
        ary = line.split(" ")
        if len(ary) == 4:
            cname = ary[3]
            flag = self.flip_over(1, cname, line, cnt, retry)
            #爬取结束,加入到set并写入文件
            if flag:
                filter_name.add(line)
                self.already_cname.append(line)
                print cnt, ' execute perfect~~~~~~~~~~~~~~~~~~~~~~~'
        else:
            print '@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ company data line is error @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@', cnt

    #根据公司名查询公司列表,翻页
    def flip_over(self, now_page, cname, line, cnt, retry):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "od_statusFilter": "0",
            "v1": "QZOrgV004",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])
        if res is None:
            if self.get_fail_cnt(1) < 10:
                print "%d-----%s ------ res is None" % (cnt, cname)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                #self.query_company_info_failure.append(line)
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 404 or res.code == 403:
            if self.get_fail_cnt(1) < 20:
                print "%d-----%s ------ %d" % (cnt, cname, res.code)
                self.add_job({'line': line, 'cnt': cnt})
                return False
            else:
                print "id is [ %s ] thread and [ %s ] proxy will be close and drop." % (
                    self.get_tid(), self.proxies_dict[self.get_tid()])
                self.add_job({'line': line, 'cnt': cnt})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy[ %s ] invalid,failcount = [ %d ]" %
                    (self.proxies_dict[self.get_tid()], self.get_fail_cnt(0)))

        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print "%d------%s ------ %d " % (cnt, cname, res.code)
            self.add_job({'line': line, 'cnt': cnt})
            time.sleep(1)
            return False
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------cname %s res.text is null----------------------------' % cname
                self.query_company_info_failure.append(line)
                return True
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            dic = eval(result)
            list = dic['list']
            if len(list) == 0:
                print 'cname %s result list length = 0 ' % cname
                self.query_company_info_failure.append(line)
                return True
            print 'cname %s result ###################  list length ------ %d' % (
                cname, len(list))
            for l in list:
                aa = {}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_info.append(spider.util.utf8str(aa))
                part = cname + "|" + l['oc_name'] + "|" + str(
                    l['oc_area']) + "|" + str(l['oc_code']) + "|" + str(
                        l['oc_number'])
                self.query_company_info_part.append(part)
                self.get_detail(l['oc_name'], l['oc_code'], l['oc_area'])
            if len(list) < 20:
                return True
            elif len(list) == 20:
                now_page += 1
                self.flip_over(now_page, cname, line, cnt)
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)
            self.query_company_info_failure.append(line)
            return True

    #查询详细信息
    def get_detail(self, cname, code, area):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        headers = {"Content-Type": "application/json"}
        encryptedJson = {
            "bl_oc_code": code,  #"71526726X"
            "v1": "QZOrgV004",
            "isDirect": "1",
            "bl_oc_name": cname,  #"腾讯科技"
            "bl_oc_area": area  #"4403"
        }

        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- encryptedJson -->', str(encryptedJson)
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 404:
            print "404 ------ ", code
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            self.get_detail(cname, code, area)
            return
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------code ', code, ' res.text is null----------------------------'
                self.detail_failure.append(cname + "|" + str(code) + "|" +
                                           str(area))
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            detail = eval(result)
            listGD = self.get_gd(area, code)
            if listGD is not None:
                detail['listGD'] = listGD['listGD']
            print 'detail=================================', spider.util.utf8str(
                detail)
            self.detail_company.append(spider.util.utf8str(detail))
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)

    #获取股东信息
    def get_gd(self, area, code):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
        headers = {"Content-Type": "application/json"}

        encryptedJson = {
            "bl_oc_area": area,  #4107
            "v1": "QZOrgV004",
            "bl_oc_code": code  #672867774
        }

        param = {
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson": self.extJson
        }
        param = spider.util.utf8str(param)
        res = self.request_url(url,
                               headers=headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- search gd code is -->', code
            return None
        elif res.code == 404:
            print "404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            #print 'gd infos =======================',spider.util.utf8str(list_gd)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None

    def get_inversted(self, url, encryptedJson):
        """
        通用请求方法
        """

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })

        res = self.request_url(url,
                               headers={"Content-Type": "application/json"},
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'res is none -- search gd code is -->', code
            return None
        elif res.code == 404:
            print "404 ------ ", code
            return None
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print res.code, '------', code
            time.sleep(0.5)
            return self.get_gd(area, code)
        elif res.code == 200:
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------gd code', code, ' res.text is null----------------------------' % cname
                return None
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            list_gd = eval(result)
            #print 'gd infos =======================',spider.util.utf8str(list_gd)
            return list_gd
        else:
            print code, "#######################################UNKNOWN ERROR#############################################", res.code
        return None

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail when the 'DONE' event arrives; ignore others."""
        if evt != 'DONE':
            return
        msg += '企业查询宝APP[公司名]和[组织机构代码]爬取已经停止...'
        subject = '%s DONE' % sys.argv[0]
        spider.util.sendmail('*****@*****.**', subject, msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                m = re.match(r'(\d+\.\d+\.\d+\.\d+:\d+)', line, re.I)
                if m:
                    prstr = m.group(1)
                    proxies = {
                        'http': 'http://' + prstr + "/",
                        'https': 'https://' + prstr + "/"
                    }
                    self.proxies_dict.append(proxies)
                elif re.match('\s*#', line):
                    continue
        print " loaded [ %d ] proxis " % len(self.proxies_dict)
Пример #25
0
class QycxbSpider(Spider):
    """
    测试只使用9位数组织机构代码去获取详情 121.40.186.237:18889:ipin:helloipin
    """
    def __init__(self):
        """Select the proxy mode, start the base spider and open the output files."""
        self._can_use_proxy_num = 0
        # Proxy mode switch: "singleADSL", "kuaidaili" or "multiADSL".
        self.is_debug = "multiADSL"
        self.proxies = {}
        mode = self.is_debug
        if mode == "singleADSL":
            # Single ADSL proxy.
            Spider.__init__(self, 200)
            self.proxy_error_cnt = 0
        elif mode == "kuaidaili":
            # Proxy list loaded from a file.
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        elif mode == "multiADSL":
            # Several ADSL proxies; each list entry is its own dict.
            remote_adsl = {
                'http': 'http://*****:*****@121.40.186.237:50001',
                'https': 'https://*****:*****@121.40.186.237:50001'
            }
            local_adsl = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
            self.proxies_dict = [
                dict(remote_adsl),
                dict(remote_adsl),
                dict(local_adsl),
                dict(local_adsl)
            ]
            Spider.__init__(self, 400)

        self._aes_ = CCIQ_AES()
        # Detail records fetched successfully.
        self.query_success = FileSaver("成功拿到的详情900.txt")
        # Codes that failed, together with the reason.
        self.query_failure = FileSaver("获取失败的机构代码和原因900.txt")
        # Codes that have already been crawled.
        self.already_cname_list = FileSaver("已经爬过机构代码900.txt")
        # Codes answered with HTTP 400.
        self.result400 = FileSaver("结果http=400的机构代码900.txt")
        # Load the codes handled by earlier runs.
        self.init_cname()

        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
        ]
        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
        ]

        # Set on the first request so record_spider() can print a rate.
        self.is_first = True
        self.init_time = 0
        self.lock = threading.Lock()
        self.req_cnt = 0

    def req_all(self, url, encryptedJson, retry=0):
        number = random.randrange(0, 3, 1)
        self.select_user_agent(self.user_agents[number])
        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJsons[number]
        })
        param = param.replace('/', "\/")
        res = None
        if self.is_first:
            self.init_time = time.time()
            print '初始化时间', self.init_time
            self.is_first = False
        if self.is_debug == "singleADSL":
            #res = self.request_url(url, headers={"Content-Type": "application/json"}, timeout=20, data=param)
            self.proxies = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
            #self.proxies = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'}
            #self.proxies = {'http': 'http://*****:*****@121.40.186.237:50001', 'https': 'https://*****:*****@121.40.186.237:50001'}
            #self.proxies = {'http': 'http://*****:*****@106.75.134.190:18889', 'https': 'https://*****:*****@106.75.134.190:18889'}
            res = self.request_url(
                url,
                headers={"Content-Type": "application/json"},
                timeout=20,
                data=param,
                proxies=self.proxies)
            time.sleep(5)
            #res = self.request_url(url, headers={"Content-Type": "application/json", "Accept-Language": "zh-Hans-CN;q=1"}, data=param, proxies={'http': 'http://*****:*****@106.75.134.191:18889', 'https': 'https://*****:*****@106.75.134.191:18889'})
        elif self.is_debug == "kuaidaili":
            self.proxies = self.proxies_dict[self.get_tid()]
            res = self.request_url(
                url,
                headers={"Content-Type": "application/json"},
                data=param,
                proxies=self.proxies)
        elif self.is_debug == "multiADSL":
            num = self.get_tid() % len(self.proxies_dict)
            self.proxies = self.proxies_dict[num]
            res = self.request_url(
                url,
                headers={"Content-Type": "application/json"},
                data=param,
                proxies=self.proxies_dict[num],
                timeout=20)

        if res is None or res.code != 200:
            print "访问错误", "res is none" if res is None else "res.code=%d" % (
                res.code), self.proxies
            if res is not None and res.code == 400 and retry > 2:
                return res
            #self.error_add()
            if retry < 7:
                time.sleep(random.randrange(1, 5, 1))
                return self.req_all(url, encryptedJson, retry=(retry + 1))
        return res

    def init_cname(self):
        cnt = 0
        with open("已经爬过机构代码900.txt", "r") as f:
            for line in f:
                cnt += 1
                filter_name.add(line.strip())
        print "初始化结束...", cnt
        with open("结果http=400的机构代码900.txt", "r") as f:
            for line in f:
                cnt += 1
                filter_name.add(line.strip())
        print "初始化结束...", cnt

    def count_proxy_error(self, error_type):
        """Placeholder for proxy error accounting; currently does nothing."""
        return None

    def wait_q_breakable(self):
        """Poll the three job queues until the work is over.

        Returns True when ``_running_count`` is zero less than a second after
        the previous pass ended, and False when ``_worker_count`` is zero at
        the end of a pass.
        """
        previous = 0
        while True:
            drained = (self.job_queue.empty() and self.job_queue2.empty()
                       and self.job_queue3.empty())
            if not drained:
                # Jobs are still waiting: pause before looking again.
                time.sleep(5)
            if time.time() < previous + 1:
                if self._running_count == 0:
                    return True
            time.sleep(2)
            previous = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        """Queue a job for every code of the input file that is not in ``filter_name``.

        After the file has been read the method waits via wait_q_breakable()
        and enqueues ``None``.
        """
        with open("推测组织机构代码.txt", "r") as f:
            for raw in f:
                code = raw.strip()
                if code not in filter_name:
                    self.add_job({"code": code, "retry": 0}, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    # def bu0(self, code):
    #     code = str(code)
    #     if len(code) != 8:
    #         sub = 8 - len(code)
    #         while sub != 0:
    #             code = "0" + code
    #             sub -= 1
    #     code = self.compute_code(code)
    #     return code
    #
    # def compute_code(self, code):
    #     code = code.strip()
    #     assert len(code) == 8
    #     vs = [3, 7, 9, 10, 5, 8, 4, 2]
    #     v = 0
    #     for i in range(0, 8):
    #         if '0' <= code[i] <= '9':
    #             v += (ord(code[i]) - ord('0')) * vs[i]
    #         elif 'A' <= code[i] <= 'Z':
    #             v += (ord(code[i]) - ord('A') + 10) * vs[i]
    #         elif 'a' <= code[i] <= 'z':
    #             v += (ord(code[i]) - ord('a') + 10) * vs[i]
    #         else:
    #             raise RuntimeError("invalid code")
    #     v = (11 - v % 11) % 11
    #     return code + '0123456789X'[v]

    def record_spider(self, code):
        """
        已经爬过的,无论成功失败都算爬过.
        """
        filter_name.add(code)
        self.already_cname_list.append(code)
        self.proxy_error_cnt = 0
        self.req_cnt += 1
        print "speed ======================>", self.req_cnt / (time.time() -
                                                               self.init_time)

    def error_add(self):
        """Disabled hook; does nothing.

        The commented-out implementation it replaces increased
        ``proxy_error_cnt`` under ``self.lock`` and called restart_jb() once
        the counter exceeded 200.
        """
        return None

    def restart_jb(self):
        if self.proxy_error_cnt < 200:
            return
        self.proxy_error_cnt = 0
        print "=============================重新启动拨号脚本================================="
        os.system(
            "sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/redial"
        )
        time.sleep(10)
        os.system(
            "sshpass -p 'helloipin' ssh [email protected] /home/ipin/bin/getip"
        )
        print "=============================重新启动拨号脚本成功=============================="

    def run_job(self, jobid):
        code = jobid.get("code")
        retry = jobid.get("retry")

        tid = self.get_tid()
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": code,  #code,  #"71526726X"
            "v1": "QZOrgV005",
            "isDirect": "0",
            "bl_oc_name": "腾讯科技",  #cname,  #"腾讯科技"
            "bl_oc_area": ""  #area #"4403"
        }
        detail = {}
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None:
            print code, "get detail     res is None !!"
            return
        res_code = res.code
        if res_code == 400:
            self.result400.append(code)
            self.req_cnt += 1
            return
        try:
            if u"服务不可用。" in res.text or u"Unauthorized!" in res.text:  # or u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志。" in res.text:
                self.re_add_job({'cname': code, 'retry': retry})
                print "系统不可用...", code, res.text
                return
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  exception " % (
                tid, retry,
                res_code), err  #res.text=%s#, spider.util.utf8str(res.text)
            self.re_add_job({'cname': code, 'retry': retry})
            return
        if len(c) == 0:
            print "tid=%d --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (
                tid, retry, res_code)
            self.query_failure.append(code + ",c=0")
            self.record_spider(code)
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            detail = eval(result)
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  --- exception result:%s" % (
                tid, retry, res_code, result)
            self.query_failure.append(code + ",result_error")
            self.record_spider(code)
            return

        cname = None
        try:
            basic = detail["list"]
            if basic is None or len(basic) == 0:
                print code, " 此码无效...", spider.util.utf8str(detail)
                self.query_failure.append(code + ",list=0")
                self.record_spider(code)
                return
            cname = basic[0]["oc_name"]
        except Exception as err:
            print code, "获取基本详情错误,拿不到oc_name,detail : ", spider.util.utf8str(
                detail)
            return

        #股东信息
        # listGD = self.get_gd(code)
        # if listGD is not None:
        #     #print "tid=", tid, " listGD=", spider.util.utf8str(listGD)
        #     detail['listGD'] = listGD['listGD']

        #投资信息
        # list_inversted = self.get_inversted(cname)
        # if list_inversted is not None:
        #     #print "tid=", tid, " list_inversted=", spider.util.utf8str(list_inversted)
        #     detail['inversted'] = list_inversted['inversted']

        # #获取分支机构信息
        # branch = []
        # list_branch = self.get_branch(cname, list_branch=branch)
        # if list_branch is not None:
        #     #print "tid=", tid, " list_branch=", spider.util.utf8str(list_branch)
        #     detail['Branch'] = list_branch #['Branch']
        self.query_success.append(spider.util.utf8str(detail))
        self.record_spider(code)

        print "tid=%d --- retry=%d --- res.code=%d  @@@ success: %s \n " % (
            tid, retry, res_code, spider.util.utf8str(
                self.proxies)), spider.util.utf8str(detail)

    def get_gd(self, code, retry=0):
        """
        获取股东信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/gd/detail"
        encryptedJson = {
            "bl_oc_area": "",
            "v1": "QZOrgV005",
            "bl_oc_code": code
        }
        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_gd --- retry=%d --- reason:len(c)=0" % retry
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                    c)
                #print "获取股东信息结果:", spider.util.utf8str(result)
                return eval(result)
            except Exception as err:
                print "get_gd --- retry=%d --- reason:%s" % (retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry * 1.5)
                    return self.get_gd(code, retry=retry)
                else:
                    return None
        else:
            print "get_gd --- retry=%d --- res.code=%d" % (retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry * 1.5)
                return self.get_gd(code, retry=retry)
            else:
                return None

    def get_inversted(self, cname, retry=0):
        """
        查询投资信息
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/map/invesment"
        encryptedJson = {"input": cname, "v1": "QZOrgV005"}

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_inversted --- cname=%s --- retry=%d --- reason:len(c)=0" % (
                        cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                    c)
                return eval(result)
            except Exception as err:
                print "get_inversted --- cname=%s --- retry=%d --- reason:%s" % (
                    cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry * 1.5)
                    return self.get_inversted(cname, retry=retry)
                else:
                    return None
        else:
            print "get_inversted --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry * 1.5)
                return self.get_inversted(cname, retry=retry)
            else:
                return None

    def get_branch(self, cname, now_page=1, list_branch=[], retry=0):
        """
        查询分支机构
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/branch/select/page"
        encryptedJson = {
            "companyName": cname,
            "v1": "QZOrgV005",
            "page": now_page,
            "pagesize": "10"
        }

        res = self.req_all(url, encryptedJson)
        if res is None:
            return None
        if res.code == 200:
            try:
                c = eval(res.text)['c']
                if len(c) == 0:
                    print "get_branch --- cname=%s --- retry=%d --- reason:len(c)=0" % (
                        cname, retry)
                    return None
                result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(
                    c)
                temp = eval(result)
                if temp is not None:
                    for t in temp['Branch']:
                        list_branch.append(t)
                    if len(temp['Branch']) == 10:
                        if now_page > 3:
                            return list_branch
                        now_page += 1
                        print cname, "翻页 -----------------------------------> now_page", now_page
                        return self.get_branch(cname,
                                               now_page=now_page,
                                               list_branch=list_branch,
                                               retry=retry)
                    else:
                        return list_branch
                else:
                    print "get_branch --- cname=%s --- retry=%d --- now_page=%d --- res.code=%d --- Branch is NULL" % (
                        cname, retry, now_page)
                    return None
            except Exception as err:
                print "get_branch --- cname=%s --- retry=%d --- reason:%s" % (
                    cname, retry, err)
                if retry < 5:
                    retry += 1
                    time.sleep(retry * 1.5)
                    return self.get_branch(cname,
                                           now_page=now_page,
                                           list_branch=list_branch,
                                           retry=retry)
                else:
                    return None
        else:
            print "get_branch --- cname=%s --- retry=%d --- res.code=%d" % (
                cname, retry, res.code)
            if retry < 5:
                retry += 1
                time.sleep(retry * 1.5)
                return self.get_branch(cname,
                                       now_page=now_page,
                                       list_branch=list_branch,
                                       retry=retry)
            else:
                return None

    def get_fail_cnt(self, addv, type):
        """
        Read, and optionally increase, a failure counter kept on
        ``self._curltls``.

        :param addv: amount to add; a falsy value only reads the counter.
        :param type: attribute name of the counter on ``self._curltls``.
        :return: the counter value after the optional increment (0 when the
            counter has never been set).
        """
        store = self._curltls
        count = getattr(store, type, 0)
        if not addv:
            return count
        count += addv
        setattr(store, type, count)
        return count

    def event_handler(self, evt, msg, **kwargs):
        """
        React to spider events; on 'DONE' mail a notification.

        :param evt: event name; only 'DONE' is handled.
        :param msg: message text, extended with a "detail query has stopped"
            notice before it is mailed.
        :param kwargs: accepted and ignored.
        """
        if evt != 'DONE':
            return
        subject = '%s DONE' % sys.argv[0]
        body = msg + '企业查询宝APP公司详情detail查询已经停止...'
        spider.util.sendmail('*****@*****.**', subject, body)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
        print " loaded [ %d ] proxis " % len(self.proxies_dict)

    def _match_proxy(self, line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1),
                                     m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
Пример #26
0
    def __init__(self):
        # Set up the proxy configuration, the output files and the request
        # fingerprints (extJson payloads / user agents) used by the spider.
        self._can_use_proxy_num = 0
        # Despite its name this is a mode selector, not a boolean; it picks
        # one of the three proxy set-ups below.
        self.is_debug = "multiADSL"
        self.proxies = {}
        if self.is_debug == "singleADSL":
            # Single ADSL proxy mode
            Spider.__init__(self, 200)
            self.proxy_error_cnt = 0
        elif self.is_debug == "kuaidaili":
            # "kuaidaili" proxy mode: proxies are loaded from a file and
            # their count is passed on to Spider.__init__
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        elif self.is_debug == "multiADSL":
            # Multiple ADSL proxies mode
            #proxies1 = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'}
            #proxies2 = {'http': 'http://*****:*****@183.56.160.174:50001', 'https': 'https://*****:*****@183.56.160.174:50001'}
            proxies1 = {
                'http': 'http://*****:*****@121.40.186.237:50001',
                'https': 'https://*****:*****@121.40.186.237:50001'
            }
            proxies2 = {
                'http': 'http://*****:*****@121.40.186.237:50001',
                'https': 'https://*****:*****@121.40.186.237:50001'
            }
            proxies3 = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
            proxies4 = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
            # NOTE(review): proxies5 is defined but not added to proxies_dict below.
            proxies5 = {
                'http': 'http://*****:*****@192.168.1.39:3428',
                'https': 'https://*****:*****@192.168.1.39:3428'
            }
            self.proxies_dict = [proxies1, proxies2, proxies3,
                                 proxies4]  #, proxies5]
            Spider.__init__(self, 400)
        # NOTE(review): any other is_debug value skips Spider.__init__ entirely.
        self._aes_ = CCIQ_AES()
        # Details that were fetched successfully
        self.query_success = FileSaver("成功拿到的详情900.txt")
        # Failed organisation codes together with the failure reason
        self.query_failure = FileSaver("获取失败的机构代码和原因900.txt")
        # Organisation codes that have already been crawled
        self.already_cname_list = FileSaver("已经爬过机构代码900.txt")
        # Organisation codes whose request came back with HTTP 400
        self.result400 = FileSaver("结果http=400的机构代码900.txt")
        # Load the companies that have already been crawled
        self.init_cname()

        # extJsons and user_agents both hold three entries.
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
        ]

        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
        ]
        self.is_first = True
        self.init_time = 0
        self.lock = threading.Lock()
        self.req_cnt = 0
Пример #27
0
class QycxbQuery(Spider):
    """
    根据企业名称.查询企业列表------针对900多万的公司名查询
    """
    def __init__(self):
        self.is_debug = False
        self._can_use_proxy_num = 0
        if self.is_debug:
            Spider.__init__(self, 1)
        else:
            self.proxies_dict = []
            self.read_proxy("../../_ct_proxy/proxy_all_filter.txt")
            Spider.__init__(self, len(self.proxies_dict))
        self.error_cnt = 0
        self._aes_ = CCIQ_AES()
        #根据公司名字查询到的公司列表全部信息
        self.query_company_list = FileSaver("all_company_list.txt")

        #已经爬取过的公司名
        self.already_cname_list = FileSaver("all_company_list_already.txt")

        #爬过的 错误类型
        self.already_error_type = FileSaver("all_already_error_type.txt")

        #初始化已经爬过的公司
        self.init_cname()
        self.extJsons = [
            "Hoi6oX70l9whauZmjq8jVAmoe3UspXXhX9mPG+KAeqs1rKZVr/uapICH92P/Crryt63u28aP4QP665AzcT/jN5Go1o3bvwMvVIkuN9e60k6WI2pVFBrwZMvxwW6BnQukSzDSlyPvEhgpR5DIHQEV6C51hMgp4Zc3OkTSsyezAm4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G3Ac4K8xpX3aMB5s8Ci2a/YpTpioZxAvptqJsQUCoNn0tLCOVM4XxMJQWbrErkOcl4=",
            "ctlCXDvoyaH2pCIArrgvXp7zrZTzpz2Q5rukh+aWvupEFABw6P2AvbmaN+HJ7IZgDJ/kgBkJt/rLppSGitYCPKGR2IGv6OXZsrJGgbRB3G1U2wdOlL49/aDwt3NZNp4TGa5iBFpYLm69F/6PPFoXIR/Aw5p48//8OgZFpddDUwQ="
        ]

        self.user_agents = [
            "=CCIQ/2.0.1 (iPhone; iOS 9.1; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.1.3; Scale/2.00)",
            "=CCIQ/2.0.1 (iPhone; iOS 8.4; Scale/2.00)"
        ]

        self.bloom = set()

    def req_all(self, encryptedJson):
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/search"
        number = random.randrange(0, 3, 1)
        self.select_user_agent(self.user_agents[number])
        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJsons[number]
        })
        param = param.replace('/', "\/")
        try:
            if self.is_debug:
                res = self.request_url(
                    url,
                    headers={
                        "Content-Type": "application/json",
                        "Accept-Language": "zh-Hans-CN;q=1"
                    },
                    data=param,
                    proxies={
                        'http': 'http://*****:*****@121.41.79.4:18889',
                        'https': 'https://*****:*****@121.41.79.4:18889'
                    })
                #res = self.request_url(url, headers={"Content-Type": "application/json"}, data=param, proxies={'http': 'http://104.236.48.178:8080', 'https': 'https://104.236.48.178:8080'})
            else:
                res = self.request_url(
                    url,
                    headers={
                        "Content-Type": "application/json",
                        "Accept-Language": "zh-Hans-CN;q=1"
                    },
                    data=param,
                    proxies=self.proxies_dict[self.get_tid()])
            if res is not None and res.code == 200:
                time.sleep(random.randrange(30, 50, 1))
            else:
                time.sleep(5)
            return res
        except Exception as err:
            proxies = self.proxies_dict[self.get_tid()]
            print proxies[
                'http'], "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@\n error = ", err

    def init_cname(self):
        with open("all_company_list_already.txt", "r") as f:
            for line in f:
                filter_name.add(line.strip())

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty(
            ) or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        with open("beijing_cname.txt", "r") as f:
            cnt = 0
            for line in f:
                line = line.strip()
                cnt += 1
                if line in filter_name:
                    #print cnt, line, "already spider!!!"
                    continue
                job = {"cname": line, "cnt": cnt, "retry": 0}
                self.add_job(job, True)
        self.wait_q_breakable()
        self.add_job(None, True)

    def record_spider(self, line):
        """
        已经爬过的,无论成功失败都算爬过.
        """
        filter_name.add(line)
        self.already_cname_list.append(line)

    def run_job(self, job):
        cname = job.get("cname")
        cnt = job.get("cnt")
        retry = job.get("retry")
        if cname is None:
            print 'cname = ', cnt, ' is None ,break~'
            return
        self.flip_over(1, cname, cnt, retry)

    def flip_over(self, now_page, cname, cnt, retry):
        tid = self.get_tid()
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson)
        res_code = 0
        if res is None:
            if self.get_fail_cnt('failcount-none', 1) < 10:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                # if retry > 5:
                #     r_result["type"] = "None"
                #     self.already_error_type.append(spider.util.utf8str(r_result))
                #     self.record_spider(cname)
                #     print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (tid, cnt, cname, retry, res_code, now_page)
                # else:
                #     self.re_add_job({'cname':cname,'cnt':cnt, 'retry':(retry+1)})
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-none = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-none', 0), tid))
        else:
            setattr(self._curltls, 'failcount-none', 0)

        res_code = res.code

        if (res_code >= 400 and res_code < 500) or res_code == 202:
            if self.get_fail_cnt('failcount-400', 1) < 5:
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d" % (
                    tid, cnt, cname, retry, res_code, now_page)
                return
            else:
                if retry > 5:
                    r_result["type"] = "400+"
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                else:
                    self.re_add_job({
                        'cname': cname,
                        'cnt': cnt,
                        'retry': (retry + 1)
                    })
                    self._can_use_proxy_num -= 1
                raise AccountErrors.NoAccountError(
                    "Maybe the proxy invalid,failcount-400 = [ %d ],tid=[ %d ]"
                    % (self.get_fail_cnt('failcount-400', 0), tid))
        else:
            setattr(self._curltls, 'failcount-400', 0)

        if res_code >= 500:
            # if retry > 2:
            #     r_result["type"]="500"
            #     self.already_error_type.append(spider.util.utf8str(r_result))
            #     self.record_spider(cname)
            # else:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d " % (
                tid, cnt, cname, retry, res_code, now_page)
            time.sleep(random.randrange(1, 10, 1))
            return
        elif res_code == 200:
            try:
                c = eval(res.text)['c']
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text - %s" % (
                    tid, cnt, cname, retry, res_code, now_page, err)
                # r_result["type"] = "res_error"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                # self.record_spider(cname)
                # self.error_cnt += 1
                self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
                return
            if len(c) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "c=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
            try:
                dic = eval(result)
            except Exception as err:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                    tid, cnt, cname, retry, res_code, now_page, result)
                r_result["type"] = "result_error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            list = dic['list']
            if len(list) == 0:
                print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                    tid, cnt, cname, retry, res_code, now_page)
                r_result["type"] = "list=0"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                self.error_cnt += 1
                return
            #print "tid=%d ### cnt=%d ### cname=%s ### retry=%d ### res.code=%d ### now_page:%d ### success:len(list):%d " % (tid, cnt, cname, retry, res_code, now_page, len(list))
            for l in list:
                aa = {"query_name": cname}
                for k, v in l.items():
                    aa[k] = v
                self.query_company_list.append(spider.util.utf8str(aa))
            print "******", len(list), spider.util.utf8str(list)
            if len(list) < 20:
                # r_result["type"] = "success"
                # self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            elif len(list) == 20:
                if now_page > 100:
                    self.already_error_type.append(
                        spider.util.utf8str(r_result))
                    self.record_spider(cname)
                    return
                now_page += 1
                self.flip_over(now_page, cname, cnt, retry)
        else:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception UNKNOW ERROR" % (
                tid, cnt, cname, retry, res_code, now_page)
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            r_result["type"] = "unknown_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            return

    def get_fail_cnt(self, type_key, addv):
        fc = getattr(self._curltls, type_key, 0)
        if (addv):
            fc += addv
            setattr(self._curltls, type_key, fc)
        return fc

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += '企业查询宝APP公司列表查询已经停止...错误数:' + str(self.error_cnt)
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0],
                                 msg)

    def read_proxy(self, fn):
        with open(fn, 'r') as f:
            for line in f:
                line = line.strip()
                self._match_proxy(line)
        self._can_use_proxy_num = len(self.proxies_dict)
        print " loaded [ %d ] proxis " % self._can_use_proxy_num

    def _match_proxy(self, line):
        m = re.match('([0-9.]+):(\d+):([a-z0-9]+):([a-z0-9._-]+)$', line, re.I)
        m1 = re.match('([0-9.]+):(\d+):([a-z0-9]+)$', line, re.I)
        if m:
            prstr = '%s:%s@%s:%s' % (m.group(3), m.group(4), m.group(1),
                                     m.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        elif m1:
            prstr = '%s:%s' % (m1.group(1), m1.group(2))
            proxies = {'http': 'http://' + prstr, 'https': 'https://' + prstr}
        else:
            proxies = {'http': 'http://' + line, 'https': 'https://' + line}
        self.proxies_dict.append(proxies)
Пример #28
0
    def run_job(self, jobid):
        code = jobid.get("code")
        retry = jobid.get("retry")

        tid = self.get_tid()
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": code,  #code,  #"71526726X"
            "v1": "QZOrgV005",
            "isDirect": "0",
            "bl_oc_name": "腾讯科技",  #cname,  #"腾讯科技"
            "bl_oc_area": ""  #area #"4403"
        }
        detail = {}
        res = self.req_all(url, encryptedJson)
        res_code = 0
        if res is None:
            print code, "get detail     res is None !!"
            return
        res_code = res.code
        if res_code == 400:
            self.result400.append(code)
            self.req_cnt += 1
            return
        try:
            if u"服务不可用。" in res.text or u"Unauthorized!" in res.text:  # or u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志。" in res.text:
                self.re_add_job({'cname': code, 'retry': retry})
                print "系统不可用...", code, res.text
                return
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  exception " % (
                tid, retry,
                res_code), err  #res.text=%s#, spider.util.utf8str(res.text)
            self.re_add_job({'cname': code, 'retry': retry})
            return
        if len(c) == 0:
            print "tid=%d --- retry=%d --- res.code=%d   --- exception 'C' IS NULL" % (
                tid, retry, res_code)
            self.query_failure.append(code + ",c=0")
            self.record_spider(code)
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            detail = eval(result)
        except Exception as err:
            print "tid=%d --- retry=%d --- res.code=%d  --- exception result:%s" % (
                tid, retry, res_code, result)
            self.query_failure.append(code + ",result_error")
            self.record_spider(code)
            return

        cname = None
        try:
            basic = detail["list"]
            if basic is None or len(basic) == 0:
                print code, " 此码无效...", spider.util.utf8str(detail)
                self.query_failure.append(code + ",list=0")
                self.record_spider(code)
                return
            cname = basic[0]["oc_name"]
        except Exception as err:
            print code, "获取基本详情错误,拿不到oc_name,detail : ", spider.util.utf8str(
                detail)
            return

        #股东信息
        # listGD = self.get_gd(code)
        # if listGD is not None:
        #     #print "tid=", tid, " listGD=", spider.util.utf8str(listGD)
        #     detail['listGD'] = listGD['listGD']

        #投资信息
        # list_inversted = self.get_inversted(cname)
        # if list_inversted is not None:
        #     #print "tid=", tid, " list_inversted=", spider.util.utf8str(list_inversted)
        #     detail['inversted'] = list_inversted['inversted']

        # #获取分支机构信息
        # branch = []
        # list_branch = self.get_branch(cname, list_branch=branch)
        # if list_branch is not None:
        #     #print "tid=", tid, " list_branch=", spider.util.utf8str(list_branch)
        #     detail['Branch'] = list_branch #['Branch']
        self.query_success.append(spider.util.utf8str(detail))
        self.record_spider(code)

        print "tid=%d --- retry=%d --- res.code=%d  @@@ success: %s \n " % (
            tid, retry, res_code, spider.util.utf8str(
                self.proxies)), spider.util.utf8str(detail)
Пример #29
0
    def flip_over(self, now_page, cname, cnt, retry):
        tid = self.get_tid()
        """
        根据公司名查询公司列表,翻页
        """
        encryptedJson = {
            "pagesize": "20",
            "page": now_page,
            "od_orderBy": "0",
            "sh_searchType": "一般搜索",
            "sh_oc_areaName": "",
            "od_statusFilter": "0",
            "v1": "QZOrgV005",
            "oc_name": cname,
            "sh_u_uid": "",
            "sh_u_name": ""
        }
        r_result = {"cname": cname}
        res = self.req_all(encryptedJson, cname=cname)
        res_code = 0
        if res is None:
            self.re_add_job({'cname': cname, 'cnt': cnt, 'retry': retry})
            return
        if u"处理请求时服务器遇到错误。有关详细信息,请参见服务器日志" in res.text:
            print "处理请求时服务器遇到错误。有关详细信息,请参见服务器日志..."
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
                return
            else:
                r_result["type"] = "request-server-error"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return

        try:
            c = eval(res.text)['c']
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception res.text = %s" % (
                tid, cnt, cname, retry, res_code, now_page,
                spider.util.utf8str(res.text))
            if retry < 3:
                self.re_add_job({
                    'cname': cname,
                    'cnt': cnt,
                    'retry': (retry + 1)
                })
            else:
                r_result["type"] = "res.text=invalid"
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
            return
        if len(c) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception 'C' IS NULL" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "c=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        result = CCIQ_AES("BF1856A312580D41256311147089E1CC").decrypt(c)
        try:
            dic = eval(result)
        except Exception as err:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d  --- now_page:%d --- exception result:%s" % (
                tid, cnt, cname, retry, res_code, now_page, result)
            r_result["type"] = "result_error"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        list = dic['list']
        if len(list) == 0:
            print "tid=%d --- cnt=%d --- cname=%s --- retry=%d --- res.code=%d --- now_page:%d --- exception len(list)=0" % (
                tid, cnt, cname, retry, res_code, now_page)
            r_result["type"] = "list=0"
            self.already_error_type.append(spider.util.utf8str(r_result))
            self.record_spider(cname)
            self.error_cnt += 1
            return
        for l in list:
            aa = {"query_name": cname}
            for k, v in l.items():
                aa[k] = v
            self.query_company_list.append(spider.util.utf8str(aa))
        print "******", len(list), spider.util.utf8str(list)
        if len(list) < 20:
            self.record_spider(cname)
            return
        elif len(list) == 20:
            if now_page > 2:
                self.already_error_type.append(spider.util.utf8str(r_result))
                self.record_spider(cname)
                return
            now_page += 1
            self.flip_over(now_page, cname, cnt, retry)
Пример #30
0
    def get_detail(self, cname, code, area):
        """
        Query the detailed information of one company.

        :param cname: company name (sent as ``bl_oc_name``).
        :param code: organisation code (sent as ``bl_oc_code``).
        :param area: area code (sent as ``bl_oc_area``).

        On HTTP 200 the decrypted detail is enriched with shareholder,
        investment and branch data and appended to ``self.detail_company``.
        A missing response, a 404 or an empty payload is logged to
        ``self.detail_failure`` as ``cname|code|area``.
        """
        url = "http://appsvc.qiye.qianzhan.com/OrgCompany.svc/orgcompany/combine/detail"
        encryptedJson = {
            "bl_oc_code": code,  # e.g. "71526726X"
            "v1": "QZOrgV004",
            "isDirect": "1",
            "bl_oc_name": cname,  # e.g. "腾讯科技" (Tencent Technology)
            "bl_oc_area": area  # e.g. "4403"
        }

        param = spider.util.utf8str({
            "encryptedJson":
            self._aes_.encrypt(spider.util.utf8str(encryptedJson)),
            "extJson":
            self.extJson
        })
        res = self.request_url(url,
                               headers=self.headers,
                               data=param,
                               proxies=self.proxies_dict[self.get_tid()])

        if res is None:
            print 'get_detail ------ res is none ,---->cname=', cname
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 404:
            print "get_detail ------ 404 ------ ", cname, code
            self.detail_failure.append(cname + "|" + str(code) + "|" +
                                       str(area))
            return
        elif res.code == 503 or res.code == 500 or res.code == 502 or res.code == 504:
            print 'get_detail ------ ', res.code, cname, code
            time.sleep(0.5)
            # NOTE(review): retries by recursion with no retry limit.
            self.get_detail(cname, code, area)
            return
        elif res.code == 200:
            # NOTE(security): eval() runs on data received from the server.
            c = eval(res.text)['c']
            if len(c) == 0:
                print '-----------------------------code ', code, ' res.text is null----------------------------'
                self.detail_failure.append(cname + "|" + str(code) + "|" +
                                           str(area))
                return
            result = CCIQ_AES("BB1856A312580D41256311147089E0CC").decrypt(c)
            detail = eval(result)
            # Fetch shareholder information
            listGD = self.get_gd(area, code)
            if listGD is not None:
                detail['listGD'] = listGD['listGD']

            # Fetch investment information
            list_inversted = self.get_inversted(cname)
            if list_inversted is not None:
                detail['inversted'] = list_inversted['inversted']

            # Fetch branch office information
            # NOTE(review): presumably this class's get_branch takes and
            # returns a {"Branch": [...]} dict -- verify against its definition.
            list_branch = self.get_branch(cname, 1, {"Branch": []})
            if list_branch is not None:
                detail['Branch'] = list_branch['Branch']

            print 'detail=================================', spider.util.utf8str(
                detail)
            self.detail_company.append(spider.util.utf8str(detail))
            return
        else:
            print "cname %s #######################################UNKNOWN ERROR############################################# [ %d ]" % (
                cname, res.code)